Spaces:
Running
Running
Commit
·
13ce80f
1
Parent(s):
b715dbd
feat: improve text splitting logic
Browse files
The splitting process now occurs in two steps:
first using the '\n' char, then refining the split using the '.' char.
app.py
CHANGED
@@ -16,7 +16,16 @@ default_k = int(os.environ.get("DEFAULT_K", 5))
|
|
16 |
|
17 |
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
docs = {}
|
22 |
|
@@ -117,24 +126,30 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
|
|
117 |
# replace Art. X. with Articulo X
|
118 |
text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
|
119 |
|
120 |
-
|
121 |
-
|
|
|
122 |
chunks = []
|
123 |
chunk = ""
|
124 |
|
125 |
art_prefix = ""
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
138 |
if chunk:
|
139 |
chunk, _ = add_prefix(chunk, art_prefix)
|
140 |
chunks.append(chunk.lower())
|
@@ -166,7 +181,6 @@ def predict(query: str, k: int = 5) -> str:
|
|
166 |
"""
|
167 |
# Embed the query
|
168 |
query_embedding = model.encode(query)
|
169 |
-
|
170 |
# Initialize a list to store all chunks and their similarities across all documents
|
171 |
all_chunks = []
|
172 |
# Iterate through all documents
|
|
|
16 |
|
17 |
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
|
18 |
|
19 |
+
replace_pairs = [
|
20 |
+
(r"¢\s+100.00", "$50"),
|
21 |
+
(r"¢\s+300.00", "$100"),
|
22 |
+
(r"¢\s+500.00", "$150"),
|
23 |
+
# Attempt to normalize the current chunk by removing more than one consecutive space,
|
24 |
+
# while preserving single spaces within words
|
25 |
+
(r"(?<!\w|[.,;]) +", " "),
|
26 |
+
# remove more than one line break, multiple underscores and unwanted headers or footers
|
27 |
+
(r"(?<!\w|[ .:])\n|_+|INDICE LEGISLATIVO|ASAMBLEA LEGISLATIVA \- REPUBLICA DE EL SALVADOR", ""),
|
28 |
+
]
|
29 |
|
30 |
docs = {}
|
31 |
|
|
|
126 |
# replace Art. X. with Articulo X
|
127 |
text = text.replace(match_result.group(), f"Articulo {match_result.group(1)} ")
|
128 |
|
129 |
+
for regex, new in replace_pairs:
|
130 |
+
text = re.sub(regex, new, text)
|
131 |
+
|
132 |
chunks = []
|
133 |
chunk = ""
|
134 |
|
135 |
art_prefix = ""
|
136 |
+
for current_segment in text.split("\n"):
|
137 |
+
remaining = ""
|
138 |
+
if len(chunk) + len(current_segment) + 1 <= max_length:
|
139 |
+
chunk += f" {current_segment}"
|
140 |
+
else:
|
141 |
+
remaining = current_segment
|
142 |
+
# split using period (.) but ignoring number such as 1.0, 2.000, etc
|
143 |
+
for idx, little_segment in enumerate(re.split(r"(?<!\d)\.", remaining)):
|
144 |
+
if len(chunk) + len(little_segment) + 2 <= max_length:
|
145 |
+
remaining = remaining.removeprefix(f"{little_segment}.")
|
146 |
+
chunk += f"{'.' if idx > 0 else ''} {little_segment}"
|
147 |
+
else:
|
148 |
+
break
|
149 |
+
if remaining:
|
150 |
+
chunk, art_prefix = add_prefix(chunk, art_prefix)
|
151 |
+
chunks.append(chunk.lower())
|
152 |
+
chunk = remaining
|
153 |
if chunk:
|
154 |
chunk, _ = add_prefix(chunk, art_prefix)
|
155 |
chunks.append(chunk.lower())
|
|
|
181 |
"""
|
182 |
# Embed the query
|
183 |
query_embedding = model.encode(query)
|
|
|
184 |
# Initialize a list to store all chunks and their similarities across all documents
|
185 |
all_chunks = []
|
186 |
# Iterate through all documents
|