contextqasv-tool

Running

App Files Community

zaldivards committed on Feb 12

Commit

13ce80f

1 Parent(s): b715dbd

feat: improve text splitting logic

Browse files

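The splitting process now occurs in two steps: the text is first split on the newline character ('\n');
any line that does not fit in the current chunk is then split further on the period character ('.'), ignoring periods that follow a digit (e.g. 1.0, 2.000).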

Files changed (1) hide show

app.py +30 -16

app.py CHANGED Viewed

@@ -16,7 +16,16 @@ default_k = int(os.environ.get("DEFAULT_K", 5))
 model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
-headers = ("INDICE LEGISLATIVO", "ASAMBLEA LEGISLATIVA - REPUBLICA DE EL SALVADOR")
 docs = {}
@@ -117,24 +126,30 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
         # replace Art. X. with Articulo X
         text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
-    # remove more than one line break, multiple underscores and unwanted headers or footers
-    text = re.sub(rf"(?<!\w)\n|_+|{headers[0]}|{headers[1]}", "", text)
     chunks = []
     chunk = ""
     art_prefix = ""
-    # split using period (.) but ignoring number such as 1.0, 2.000, etc
-    for current_segment in re.split(r"(?<!\d)\.", text):
-        # Attempt to normalize the current chunk by removing more than one consecutive space,
-        # while preserving single spaces within words
-        current_segment = re.sub(r"(?<!\w|[.,;]) +", " ", current_segment).strip()
-        if len(chunk) + len(current_segment) + 2 < max_length:
-            chunk += f". {current_segment}"
-            continue
-        chunk, art_prefix = add_prefix(chunk, art_prefix)
-        chunks.append(chunk.lower())
-        chunk = current_segment
     if chunk:
         chunk, _ = add_prefix(chunk, art_prefix)
         chunks.append(chunk.lower())
@@ -166,7 +181,6 @@ def predict(query: str, k: int = 5) -> str:
     """
     # Embed the query
     query_embedding = model.encode(query)
     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
     # Iterate through all documents

 model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
+replace_pairs = [
+    (r"¢\s+100.00", "$50"),
+    (r"¢\s+300.00", "$100"),
+    (r"¢\s+500.00", "$150"),
+    # Attempt to normalize the current chunk by removing more than one consecutive space,
+    # while preserving single spaces within words
+    (r"(?<!\w|[.,;]) +", " "),
+    # remove more than one line break, multiple underscores and unwanted headers or footers
+    (r"(?<!\w|[ .:])\n|_+|INDICE LEGISLATIVO|ASAMBLEA LEGISLATIVA \- REPUBLICA DE EL SALVADOR", ""),
+]
 docs = {}
         # replace Art. X. with Articulo X
         text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
+    for regex, new in replace_pairs:
+        text = re.sub(regex, new, text)
     chunks = []
     chunk = ""
     art_prefix = ""
+    for current_segment in text.split("\n"):
+        remaining = ""
+        if len(chunk) + len(current_segment) + 1 <= max_length:
+            chunk += f" {current_segment}"
+        else:
+            remaining = current_segment
+            # split using period (.) but ignoring number such as 1.0, 2.000, etc
+            for idx, little_segment in enumerate(re.split(r"(?<!\d)\.", remaining)):
+                if len(chunk) + len(little_segment) + 2 <= max_length:
+                    remaining = remaining.removeprefix(f"{little_segment}.")
+                    chunk += f"{'.' if idx > 0 else ''} {little_segment}"
+                else:
+                    break
+        if remaining:
+            chunk, art_prefix = add_prefix(chunk, art_prefix)
+            chunks.append(chunk.lower())
+            chunk = remaining
     if chunk:
         chunk, _ = add_prefix(chunk, art_prefix)
         chunks.append(chunk.lower())
     """
     # Embed the query
     query_embedding = model.encode(query)
     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
     # Iterate through all documents