contextqasv-tool

Running

zaldivards committed on Feb 11

Commit

b715dbd

1 Parent(s): f713e02

feat: further chunk normalization

- Normalize article names
- Collapse runs of consecutive spaces
- Add an identifying prefix to chunks that
continue a previous article

Files changed (2) hide show

app.py +51 -12
sources/Constitucion de la Republica.pdf +0 -0

app.py CHANGED Viewed

@@ -11,11 +11,13 @@ from pypdf import PdfReader
 from transformers import AutoModel
-chunk_size = int(os.environ.get("CHUNK_SIZE", 1000))
 default_k = int(os.environ.get("DEFAULT_K", 5))
 model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
 docs = {}
@@ -66,9 +68,35 @@ def convert(filename: str) -> str:
     raise ValueError(f"Unsupported file type: {filename}")
 def generate_chunks(text: str, max_length: int) -> list[str]:
     """Generate chunks from a file's raw text. Chunks are calculated based
-    on the `max_lenght` parameter and the split character (.)
     Parameters
     ----------
@@ -76,7 +104,7 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
         The raw text
     max_length : int
         Maximum number of characters a chunk can have. Note that chunks
-        may not have this exact lenght, as another component is also
         involved in the splitting process
     Returns
@@ -85,20 +113,31 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
         A list of chunks/nodes
     """
-    segments = text.split(".")
     chunks = []
     chunk = ""
-    for current_segment in segments:
-        # try to normalize the current chunk
-        current_segment = re.sub(r"\s+", " ", current_segment).strip()
-        if len(chunk) < max_length:
             chunk += f". {current_segment}"
-        else:
-            chunks.append(chunk)
-            chunk = current_segment
     if chunk:
-        chunks.append(chunk)
     return chunks

 from transformers import AutoModel
# Module-level settings, shown here as new-side diff lines (gutter characters preserved).
# Read from the CHUNK_SIZE environment variable, 250 when unset.
# Presumably passed as `max_length` to generate_chunks -- confirm against the caller.
+chunk_size = int(os.environ.get("CHUNK_SIZE", 250))
# Read from the DEFAULT_K environment variable, 5 when unset; its consumer is outside this excerpt.
 default_k = int(os.environ.get("DEFAULT_K", 5))
# Loaded once at import time. NOTE(review): trust_remote_code=True lets transformers run
# code shipped in the model repository -- confirm the revision is trusted/pinned.
 model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
# Literal header/footer strings that generate_chunks strips from the extracted text.
+headers = ("INDICE LEGISLATIVO", "ASAMBLEA LEGISLATIVA - REPUBLICA DE EL SALVADOR")
# Starts empty; how it is keyed and filled is not visible in this excerpt.
 docs = {}
     raise ValueError(f"Unsupported file type: {filename}")
def add_prefix(chunk: str, art_prefix: str) -> tuple[str, str]:
    """Tag a chunk with the article(s) it belongs to.

    Article headers are recognised as ``Articulo <n>`` followed by whitespace
    and a dash. A chunk that continues an article started in a previous chunk
    is prefixed with ``<<Articulo a|Articulo b>>`` listing every article it
    touches, in order.

    Parameters
    ----------
    chunk : str
        original chunk
    art_prefix : str
        Article carried over from the previous chunk, or an empty string
        when no article has been seen yet

    Returns
    -------
    tuple[str, str]
        The (possibly prefixed) chunk and the article identifier to carry
        over to the next chunk
    """
    found = re.findall(r"(Articulo \d+)\s+-", chunk)

    if not found:
        # Pure continuation. Before any article has been seen there is nothing
        # to tag the chunk with, so return it untouched instead of emitting an
        # empty "<<>>" marker.
        if not art_prefix:
            return chunk, art_prefix
        return f"<<{art_prefix}>>{chunk}", art_prefix

    # The last article mentioned is the one the next chunk may continue.
    next_prefix = found[-1]

    if len(found) == 1:
        # A header within the first few characters means the chunk *is* the
        # start of that article, so it needs no prefix.
        if chunk.find(found[0]) <= 4:
            return chunk, next_prefix
        # Otherwise the text before the header belongs to the previous article.
        if art_prefix:
            found.insert(0, art_prefix)

    # NOTE(review): with two or more headers the carried-over article is not
    # added even if the chunk opens with continuation text; kept as in the
    # original -- confirm this is intended.
    return f"<<{'|'.join(found)}>>{chunk}", next_prefix
 def generate_chunks(text: str, max_length: int) -> list[str]:
     """Generate chunks from a file's raw text.

     The text is normalized, split into sentences on the period character (.)
     and the sentences are packed into chunks guided by `max_length`. Each
     chunk is tagged through `add_prefix` and lower-cased before being stored.

     Parameters
     ----------
     text : str
         The raw text
     max_length : int
         Maximum number of characters a chunk can have. Note that chunks
         may not have this exact length, as another component is also
         involved in the splitting process (a single sentence longer than
         `max_length` is kept whole)

     Returns
     -------
     list[str]
         A list of chunks/nodes
     """
     # Replace "Art. X." with "Articulo X " in one pass so the period of the
     # abbreviation is not mistaken for a sentence boundary below.
     text = re.sub(r"Art\. (\d+)\.", r"Articulo \1 ", text)

     # Remove line breaks that do not follow a word character, runs of
     # underscores and the known headers/footers. Headers are escaped so they
     # are always matched literally.
     noise = [r"(?<!\w)\n", r"_+"]
     noise.extend(re.escape(header) for header in headers if header)
     text = re.sub("|".join(noise), "", text)

     # NOTE(review): because of the look-behind, a run of spaces right after a
     # word character keeps its first space ("a   b" -> "a  b"); the pattern is
     # kept exactly as in the original -- confirm this is the intended result.
     extra_spaces = re.compile(r"(?<!\w|[.,;]) +")

     chunks = []
     chunk = ""
     art_prefix = ""
     # Split on periods, but not on those preceded by a digit (1.0, 2.000, ...).
     for raw_segment in re.split(r"(?<!\d)\.", text):
         segment = extra_spaces.sub(" ", raw_segment).strip()
         if not segment:
             # Consecutive or trailing periods produce empty segments; skipping
             # them avoids dangling ". " separators and empty chunks.
             continue
         if not chunk:
             # First sentence of a chunk: no separator, and nothing to flush.
             chunk = segment
             continue
         if len(chunk) + len(segment) + 2 < max_length:
             chunk += f". {segment}"
             continue
         # The sentence does not fit: flush the current chunk and start anew.
         chunk, art_prefix = add_prefix(chunk, art_prefix)
         chunks.append(chunk.lower())
         chunk = segment

     if chunk:
         chunk, _ = add_prefix(chunk, art_prefix)
         chunks.append(chunk.lower())
     return chunks

sources/Constitucion de la Republica.pdf CHANGED Viewed

Binary files a/sources/Constitucion de la Republica.pdf and b/sources/Constitucion de la Republica.pdf differ