Spaces:
Running
Running
Commit
·
b715dbd
1
Parent(s):
f713e02
feat: further chunk normalization
Browse files- Normalize article names
- Collapse runs of more than one consecutive space
- Add a prefix identifier to chunks that are
article continuations
- app.py +51 -12
- sources/Constitucion de la Republica.pdf +0 -0
app.py
CHANGED
@@ -11,11 +11,13 @@ from pypdf import PdfReader
|
|
11 |
from transformers import AutoModel
|
12 |
|
13 |
|
14 |
-
chunk_size = int(os.environ.get("CHUNK_SIZE",
|
15 |
default_k = int(os.environ.get("DEFAULT_K", 5))
|
16 |
|
17 |
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
|
18 |
|
|
|
|
|
19 |
docs = {}
|
20 |
|
21 |
|
@@ -66,9 +68,35 @@ def convert(filename: str) -> str:
|
|
66 |
raise ValueError(f"Unsupported file type: {filename}")
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def generate_chunks(text: str, max_length: int) -> list[str]:
|
70 |
"""Generate chunks from a file's raw text. Chunks are calculated based
|
71 |
-
on the `
|
72 |
|
73 |
Parameters
|
74 |
----------
|
@@ -76,7 +104,7 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
|
|
76 |
The raw text
|
77 |
max_length : int
|
78 |
Maximum number of characters a chunk can have. Note that chunks
|
79 |
-
may not have this exact
|
80 |
involved in the splitting process
|
81 |
|
82 |
Returns
|
@@ -85,20 +113,31 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
|
|
85 |
A list of chunks/nodes
|
86 |
"""
|
87 |
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
89 |
chunks = []
|
90 |
chunk = ""
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
96 |
chunk += f". {current_segment}"
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
100 |
if chunk:
|
101 |
-
|
|
|
102 |
return chunks
|
103 |
|
104 |
|
|
|
11 |
from transformers import AutoModel
|
12 |
|
13 |
|
14 |
+
chunk_size = int(os.environ.get("CHUNK_SIZE", 250))
|
15 |
default_k = int(os.environ.get("DEFAULT_K", 5))
|
16 |
|
17 |
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
|
18 |
|
19 |
+
headers = ("INDICE LEGISLATIVO", "ASAMBLEA LEGISLATIVA - REPUBLICA DE EL SALVADOR")
|
20 |
+
|
21 |
docs = {}
|
22 |
|
23 |
|
|
|
68 |
raise ValueError(f"Unsupported file type: {filename}")
|
69 |
|
70 |
|
71 |
+
def add_prefix(chunk: str, art_prefix: str) -> tuple[str, str]:
    """Tag a chunk with the article(s) it belongs to.

    A chunk that continues an article started in a previous chunk gets a
    ``<<Articulo N>>`` marker prepended (several articles are joined with
    ``|``). A chunk whose only article heading sits at its very beginning
    is returned untouched, since it already identifies itself.

    Parameters
    ----------
    chunk : str
        original chunk
    art_prefix : str
        current prefix, i.e. the last article seen in previous chunks
        (empty string when no article has been seen yet)

    Returns
    -------
    tuple[str, str]
        The updated chunk and the new prefix (the last article heading
        found in ``chunk``, or ``art_prefix`` unchanged when there is none)
    """
    # Article headings look like "Articulo 12 -"; only the "Articulo 12"
    # part is captured.
    articles = re.findall(r"(Articulo \d+)\s+-", chunk)

    if not articles:
        # Pure continuation chunk. Without a known article there is nothing
        # meaningful to tag it with, so avoid emitting an empty "<<>>" marker.
        if not art_prefix:
            return chunk, art_prefix
        return f"<<{art_prefix}>>{chunk}", art_prefix

    # The last heading in this chunk is what following chunks continue.
    new_prefix = articles[-1]

    if len(articles) == 1:
        # Offsets <= 4 count as "the chunk starts with the heading"
        # (presumably to tolerate a few leading separator characters).
        if chunk.find(articles[0]) <= 4:
            return chunk, new_prefix
        # The heading appears mid-chunk, so the text before it still belongs
        # to the previous article (when one is known).
        if art_prefix:
            articles.insert(0, art_prefix)

    return f"<<{'|'.join(articles)}>>{chunk}", new_prefix
|
95 |
+
|
96 |
+
|
97 |
def generate_chunks(text: str, max_length: int) -> list[str]:
    """Generate chunks from a file's raw text. Chunks are calculated based
    on the `max_length` parameter and the split character (.)

    Before splitting, the text is normalized: ``Art. X.`` headings become
    ``Articulo X``, underscores and the known page headers are removed, and
    line breaks that do not directly follow a word character are dropped.
    Every emitted chunk is passed through `add_prefix` and lower-cased.

    Parameters
    ----------
    text : str
        The raw text
    max_length : int
        Maximum number of characters a chunk can have. Note that chunks
        may not have this exact length, as another component is also
        involved in the splitting process

    Returns
    -------
    list[str]
        A list of chunks/nodes
    """
    # replace Art. X. with Articulo X (single pass instead of one
    # str.replace per match)
    text = re.sub(r"Art\. (\d+)\.", r"Articulo \1 ", text)

    # remove line breaks not preceded by a word character, runs of
    # underscores and unwanted headers or footers. Headers are escaped so
    # they are always matched literally, however many are configured.
    noise = "|".join([r"(?<!\w)\n", r"_+", *(re.escape(header) for header in headers)])
    text = re.sub(noise, "", text)

    chunks = []
    chunk = ""
    art_prefix = ""

    # split using period (.) but ignoring number such as 1.0, 2.000, etc
    for current_segment in re.split(r"(?<!\d)\.", text):
        # Attempt to normalize the current segment by collapsing runs of
        # spaces that do not directly follow a word character or one of
        # ".,;" into a single space
        current_segment = re.sub(r"(?<!\w|[.,;]) +", " ", current_segment).strip()

        if len(chunk) + len(current_segment) + 2 < max_length:
            # Only insert the ". " separator between segments; an empty
            # chunk must not start with a stray ". "
            chunk = f"{chunk}. {current_segment}" if chunk else current_segment
            continue

        # The segment does not fit: flush what has been accumulated so far.
        # An empty accumulator (e.g. an oversized first segment) is skipped
        # so that no empty chunk is emitted.
        if chunk:
            chunk, art_prefix = add_prefix(chunk, art_prefix)
            chunks.append(chunk.lower())
        chunk = current_segment

    if chunk:
        chunk, _ = add_prefix(chunk, art_prefix)
        chunks.append(chunk.lower())
    return chunks
|
142 |
|
143 |
|
sources/Constitucion de la Republica.pdf
CHANGED
Binary files a/sources/Constitucion de la Republica.pdf and b/sources/Constitucion de la Republica.pdf differ
|
|