zaldivards committed on
Commit
13ce80f
·
1 Parent(s): b715dbd

feat: improve text splitting logic

Browse files

The splitting process now occurs in two steps:
first using the '\n' char, then refining the split using the '.' char.

Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -16,7 +16,16 @@ default_k = int(os.environ.get("DEFAULT_K", 5))
16
 
17
  model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
18
 
19
- headers = ("INDICE LEGISLATIVO", "ASAMBLEA LEGISLATIVA - REPUBLICA DE EL SALVADOR")
 
 
 
 
 
 
 
 
 
20
 
21
  docs = {}
22
 
@@ -117,24 +126,30 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
117
  # replace Art. X. with Articulo X
118
  text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
119
 
120
- # remove more than one line break, multiple underscores and unwanted headers or footers
121
- text = re.sub(rf"(?<!\w)\n|_+|{headers[0]}|{headers[1]}", "", text)
 
122
  chunks = []
123
  chunk = ""
124
 
125
  art_prefix = ""
126
- # split using period (.) but ignoring number such as 1.0, 2.000, etc
127
- for current_segment in re.split(r"(?<!\d)\.", text):
128
- # Attempt to normalize the current chunk by removing more than one consecutive space,
129
- # while preserving single spaces within words
130
- current_segment = re.sub(r"(?<!\w|[.,;]) +", " ", current_segment).strip()
131
-
132
- if len(chunk) + len(current_segment) + 2 < max_length:
133
- chunk += f". {current_segment}"
134
- continue
135
- chunk, art_prefix = add_prefix(chunk, art_prefix)
136
- chunks.append(chunk.lower())
137
- chunk = current_segment
 
 
 
 
 
138
  if chunk:
139
  chunk, _ = add_prefix(chunk, art_prefix)
140
  chunks.append(chunk.lower())
@@ -166,7 +181,6 @@ def predict(query: str, k: int = 5) -> str:
166
  """
167
  # Embed the query
168
  query_embedding = model.encode(query)
169
-
170
  # Initialize a list to store all chunks and their similarities across all documents
171
  all_chunks = []
172
  # Iterate through all documents
 
16
 
17
  model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
18
 
19
+ replace_pairs = [
20
+ (r"¢\s+100.00", "$50"),
21
+ (r"¢\s+300.00", "$100"),
22
+ (r"¢\s+500.00", "$150"),
23
+ # Attempt to normalize the text by removing more than one consecutive space,
24
+ # while preserving single spaces within words
25
+ (r"(?<!\w|[.,;]) +", " "),
26
+ # remove more than one line break, multiple underscores and unwanted headers or footers
27
+ (r"(?<!\w|[ .:])\n|_+|INDICE LEGISLATIVO|ASAMBLEA LEGISLATIVA \- REPUBLICA DE EL SALVADOR", ""),
28
+ ]
29
 
30
  docs = {}
31
 
 
126
  # replace Art. X. with Articulo X
127
  text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
128
 
129
+ for regex, new in replace_pairs:
130
+ text = re.sub(regex, new, text)
131
+
132
  chunks = []
133
  chunk = ""
134
 
135
  art_prefix = ""
136
+ for current_segment in text.split("\n"):
137
+ remaining = ""
138
+ if len(chunk) + len(current_segment) + 1 <= max_length:
139
+ chunk += f" {current_segment}"
140
+ else:
141
+ remaining = current_segment
142
+ # split using period (.) but ignoring numbers such as 1.0, 2.000, etc.
143
+ for idx, little_segment in enumerate(re.split(r"(?<!\d)\.", remaining)):
144
+ if len(chunk) + len(little_segment) + 2 <= max_length:
145
+ remaining = remaining.removeprefix(f"{little_segment}.")
146
+ chunk += f"{'.' if idx > 0 else ''} {little_segment}"
147
+ else:
148
+ break
149
+ if remaining:
150
+ chunk, art_prefix = add_prefix(chunk, art_prefix)
151
+ chunks.append(chunk.lower())
152
+ chunk = remaining
153
  if chunk:
154
  chunk, _ = add_prefix(chunk, art_prefix)
155
  chunks.append(chunk.lower())
 
181
  """
182
  # Embed the query
183
  query_embedding = model.encode(query)
 
184
  # Initialize a list to store all chunks and their similarities across all documents
185
  all_chunks = []
186
  # Iterate through all documents