Update search specification
Browse files
app.py
CHANGED
@@ -145,6 +145,62 @@ def get_scope(specification: str, version: str):
|
|
145 |
traceback.print_exception(e)
|
146 |
return "Not found (error)"
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
class DocRequest(BaseModel):
|
149 |
doc_id: str
|
150 |
release: Optional[int] = None
|
@@ -167,13 +223,14 @@ class BatchDocResponse(BaseModel):
|
|
167 |
|
168 |
class KeywordRequest(BaseModel):
|
169 |
keywords: str
|
|
|
170 |
release: Optional[str] = None
|
171 |
wg: Optional[str] = None
|
172 |
spec_type: Optional[Literal["TS", "TR"]] = None
|
173 |
mode: Optional[Literal["and", "or"]] = "and"
|
174 |
|
175 |
class KeywordResponse(BaseModel):
|
176 |
-
results: List[Dict[str, str]]
|
177 |
search_time: float
|
178 |
|
179 |
class TsgDocFinder:
|
@@ -301,7 +358,9 @@ class SpecDocFinder:
|
|
301 |
def __init__(self):
|
302 |
self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
|
303 |
self.indexer_file = "indexed_specifications.json"
|
|
|
304 |
self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
|
|
|
305 |
|
306 |
def load_indexer(self):
|
307 |
"""Load existing index if available"""
|
@@ -311,6 +370,31 @@ class SpecDocFinder:
|
|
311 |
return x["specs"], x["scopes"], x["last_indexed_date"]
|
312 |
return {}, {}, None
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
def save_indexer(self):
|
315 |
"""Save the updated index"""
|
316 |
self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
|
@@ -364,16 +448,16 @@ async def main_menu():
|
|
364 |
@app.post("/search-spec", response_model=KeywordResponse)
|
365 |
def search_spec(request: KeywordRequest):
|
366 |
start_time = time.time()
|
367 |
-
|
|
|
|
|
368 |
results = []
|
369 |
|
370 |
for string, spec in finder_spec.indexer_specs.items():
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
if not any(kw in string.lower() for kw in kws):
|
376 |
-
continue
|
377 |
release = request.release
|
378 |
working_group = request.wg
|
379 |
spec_type = request.spec_type
|
@@ -385,7 +469,37 @@ def search_spec(request: KeywordRequest):
|
|
385 |
if spec_type is not None and spec["type"] != spec_type:
|
386 |
continue
|
387 |
|
388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
if len(results) > 0:
|
390 |
return KeywordResponse(
|
391 |
results=results,
|
@@ -394,6 +508,39 @@ def search_spec(request: KeywordRequest):
|
|
394 |
else:
|
395 |
raise HTTPException(status_code=404, detail="Specifications not found")
|
396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
# def search_spec(request: KeywordRequest):
|
398 |
# chars = "0123456789abcdefghijklmnopqrstuvwxyz"
|
399 |
# start_time = time.time()
|
@@ -473,8 +620,6 @@ def search_spec(request: KeywordRequest):
|
|
473 |
def find_document(request: DocRequest):
|
474 |
start_time = time.time()
|
475 |
finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
|
476 |
-
print(finder)
|
477 |
-
|
478 |
result = finder.search_document(request.doc_id, request.release)
|
479 |
|
480 |
if "not found" not in result and "Could not" not in result and "Unable" not in result:
|
|
|
145 |
traceback.print_exception(e)
|
146 |
return "Not found (error)"
|
147 |
|
148 |
+
def get_spec_content(specification: str, version: str):
    """Split a specification's text into a ``{chapter heading: chapter text}`` dict.

    The text is obtained from ``get_text(specification, version)`` and is
    handled as an indexable sequence of lines (assumed to be a list of
    strings -- TODO confirm against ``get_text``).

    The lines between the first two lines containing "Foreword" are read as
    the table of contents (presumably the first hit is the TOC entry and the
    second the real heading -- verify on a sample document). Every TOC line
    shaped like ``<number>\\t<title>`` with a one- to five-level dotted number
    becomes a chapter; each chapter is then located in the text and the lines
    up to the next located chapter become its content.

    Args:
        specification: Specification identifier forwarded to ``get_text``.
        version: Version identifier forwarded to ``get_text``.

    Returns:
        dict: heading (tabs replaced by single spaces) -> chapter text.
        An empty dict is returned when the table of contents cannot be
        delimited or when no chapter heading can be located, instead of
        raising ``IndexError`` as the previous implementation did.
    """
    text = get_text(specification, version)

    # Indexes of the first two lines mentioning "Foreword".
    forewords = []
    for position, line in enumerate(text):
        if "Foreword" in line:
            forewords.append(position)
            if len(forewords) >= 2:
                break
    if len(forewords) < 2:
        # No way to delimit the table of contents.
        return {}
    toc_start, body_start = forewords

    # One pattern covers "1", "1.2", ... up to five numbering levels. The dots
    # are escaped so that each TOC line yields at most one chapter entry.
    heading_pattern = re.compile(r"^\d+(?:\.\d+){0,4}\t[ \S]+")
    chapters = []
    for line in text[toc_start:body_start]:
        if heading_pattern.search(line):
            # Keep "<number>\t<title>" and drop trailing columns of the TOC line.
            chapters.append("\t".join(line.split("\t")[:2]))

    # Map each heading to the index of the line where it appears in the text.
    real_toc_indexes = {}
    for chapter in chapters:
        try:
            real_toc_indexes[chapter] = text.index(chapter)
            continue
        except ValueError:
            pass
        # The exact "<number>\t<title>" line is absent: fall back to the first
        # line after the second "Foreword" that starts with the chapter
        # number. Matching on the prefix (rather than anywhere in the line)
        # keeps chapter "5" from being attached to a "4.5\t..." heading.
        number = chapter.split("\t")[0] + "\t"
        for offset, line in enumerate(text[body_start:]):
            if line.lstrip().startswith(number):
                real_toc_indexes[line] = body_start + offset
                break
        # Chapters that cannot be located at all are simply left out.

    if not real_toc_indexes:
        return {}

    toc = list(real_toc_indexes)
    index_toc = list(real_toc_indexes.values())

    document = {}
    # Every chapter but the last: lines strictly between this heading and the
    # next one, newlines kept, runs of spaces/tabs collapsed.
    for heading, start, end in zip(toc, index_toc, index_toc[1:]):
        document[heading.replace("\t", " ")] = re.sub(
            r"[\ \t]+", " ", "\n".join(text[start + 1:end])
        )

    # Last chapter: everything after its heading. As in the original code, all
    # whitespace (including newlines) is collapsed for this final chapter.
    document[toc[-1].replace("\t", " ")] = re.sub(
        r"\s+", " ", " ".join(text[index_toc[-1] + 1:])
    )
    return document
|
200 |
+
|
201 |
+
def willLower(string: str, lowered: bool):
    """Return ``string`` lower-cased when ``lowered`` is truthy, else unchanged."""
    if lowered:
        return string.lower()
    return string
|
203 |
+
|
204 |
class DocRequest(BaseModel):
|
205 |
doc_id: str
|
206 |
release: Optional[int] = None
|
|
|
223 |
|
224 |
class KeywordRequest(BaseModel):
    """Request body of the ``/search-spec`` endpoint (see ``search_spec``)."""
    # Keywords to look for; search_spec splits this string on single spaces.
    keywords: str
    # NOTE(review): search_spec passes this flag as the `lowered` argument of
    # willLower(), so True makes the comparison lower-cased -- this looks
    # inverted relative to the field name; confirm the intended meaning.
    case_sensitive: Optional[bool] = False
    # Optional release filter (presumably compared with the major part of the
    # spec version, as in the older commented-out search_spec -- verify).
    release: Optional[str] = None
    # Optional working-group filter; read by search_spec as `working_group`.
    wg: Optional[str] = None
    # Optional filter compared against spec["type"] in search_spec.
    spec_type: Optional[Literal["TS", "TR"]] = None
    # "and": every keyword must match; "or": at least one keyword must match.
    mode: Optional[Literal["and", "or"]] = "and"
|
231 |
|
232 |
class KeywordResponse(BaseModel):
    """Response model declared for the ``/search-spec`` endpoint."""
    # One dict per matching specification. search_spec adds a "contains" entry
    # holding a {chapter: text} dict, hence the `str|dict` value type.
    results: List[Dict[str, str|dict]]
    # Duration of the search (presumably seconds, computed from time.time()
    # in search_spec -- confirm).
    search_time: float
|
235 |
|
236 |
class TsgDocFinder:
|
|
|
358 |
def __init__(self):
    """Set the index file locations and load the stored indexes."""
    self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
    # Presumably the file read by load_indexer() -- its body is not fully
    # visible here; confirm.
    self.indexer_file = "indexed_specifications.json"
    # Zip archive read by load_documents(); must be set before that call.
    self.doc_zip = "indexed_docs_content.zip"
    self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
    # Pre-extracted document contents, looked up by get_document().
    self.indexer_documents = self.load_documents()
|
364 |
|
365 |
def load_indexer(self):
|
366 |
"""Load existing index if available"""
|
|
|
370 |
return x["specs"], x["scopes"], x["last_indexed_date"]
|
371 |
return {}, {}, None
|
372 |
|
373 |
+
def load_documents(self):
    """Load the pre-extracted document contents from the ``self.doc_zip`` archive.

    The archive members are scanned in order and the parsed content of the
    first ``.json`` member that decodes successfully is returned.

    Returns:
        The decoded JSON data, or an empty dict when the archive does not
        exist, cannot be read as a zip file, or holds no valid JSON member.
    """
    if os.path.exists(self.doc_zip):
        try:
            # Hand the path to ZipFile so that it owns the file handle and
            # closes it on exit; a handle opened by the caller and passed in
            # is left open by ZipFile.
            with zipfile.ZipFile(self.doc_zip) as zf:
                for file_name in zf.namelist():
                    if not file_name.endswith(".json"):
                        continue
                    try:
                        doc_data = json.loads(zf.read(file_name).decode("utf-8"))
                    except (json.JSONDecodeError, UnicodeDecodeError) as e:
                        # Best effort: report and try the next JSON member.
                        print(f"Error while decoding the JSON file {file_name}: {e}")
                        continue
                    print("Documents loaded successfully !")
                    return doc_data
        except (zipfile.BadZipFile, OSError) as e:
            # A damaged archive must not prevent the finder from being built.
            print(f"Error while reading the archive {self.doc_zip}: {e}")
    print("Failed !")
    return {}
|
387 |
+
|
388 |
+
def get_document(self, spec, version):
    """Return the content of ``spec``.

    The entry stored in ``self.indexer_documents`` is used when it exists and
    is non-empty; otherwise the content is built on demand with
    ``get_spec_content(spec, version)``.
    """
    preloaded = self.indexer_documents.get(spec)
    if not preloaded:
        # Missing (or empty) entry: extract the content from the document.
        return get_spec_content(spec, version)
    return preloaded
|
394 |
+
|
395 |
+
def get_section(self, doc, chapter):
    """Return the entry of ``doc`` stored under the key ``chapter``."""
    section = doc[chapter]
    return section
|
397 |
+
|
398 |
def save_indexer(self):
|
399 |
"""Save the updated index"""
|
400 |
self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
|
|
|
448 |
@app.post("/search-spec", response_model=KeywordResponse)
|
449 |
def search_spec(request: KeywordRequest):
|
450 |
start_time = time.time()
|
451 |
+
booleanLowered = request.case_sensitive
|
452 |
+
kws = [willLower(_, booleanLowered) for _ in request.keywords.split(" ")]
|
453 |
+
unique_specs = set()
|
454 |
results = []
|
455 |
|
456 |
for string, spec in finder_spec.indexer_specs.items():
|
457 |
+
put = False
|
458 |
+
if spec['id'] in unique_specs:
|
459 |
+
continue
|
460 |
+
|
|
|
|
|
461 |
release = request.release
|
462 |
working_group = request.wg
|
463 |
spec_type = request.spec_type
|
|
|
469 |
if spec_type is not None and spec["type"] != spec_type:
|
470 |
continue
|
471 |
|
472 |
+
contents = []
|
473 |
+
version = finder_spec.search_document(spec['id'], spec['release']).split("/")[-1].replace(".zip", "").split("-")[-1]
|
474 |
+
doc = finder_spec.get_document(spec['id'], version)
|
475 |
+
docValid = not isinstance(doc, str)
|
476 |
+
|
477 |
+
if request.mode == "and":
|
478 |
+
if all(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
|
479 |
+
put = True
|
480 |
+
if docValid:
|
481 |
+
for chapter in list(doc.keys())[1:]:
|
482 |
+
if "references" not in chapter.lower():
|
483 |
+
if all(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
|
484 |
+
put = True
|
485 |
+
contents.append(chapter)
|
486 |
+
elif request.mode == "or":
|
487 |
+
if any(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
|
488 |
+
put = True
|
489 |
+
if docValid:
|
490 |
+
for chapter in list(doc.keys())[1:]:
|
491 |
+
if "references" not in chapter.lower():
|
492 |
+
if any(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
|
493 |
+
put = True
|
494 |
+
contents.append(chapter)
|
495 |
+
|
496 |
+
if put:
|
497 |
+
spec_content = spec
|
498 |
+
spec_content["contains"] = {chap: doc[chap] for chap in contents}
|
499 |
+
|
500 |
+
results.append(spec_content)
|
501 |
+
else:
|
502 |
+
unique_specs.add(spec['id'])
|
503 |
if len(results) > 0:
|
504 |
return KeywordResponse(
|
505 |
results=results,
|
|
|
508 |
else:
|
509 |
raise HTTPException(status_code=404, detail="Specifications not found")
|
510 |
|
511 |
+
# @app.post("/search-spec", response_model=KeywordResponse)
|
512 |
+
# def search_spec(request: KeywordRequest):
|
513 |
+
# start_time = time.time()
|
514 |
+
# kws = [_.lower() for _ in request.keywords.split(" ")]
|
515 |
+
# results = []
|
516 |
+
|
517 |
+
# for string, spec in finder_spec.indexer_specs.items():
|
518 |
+
# if request.mode == "and":
|
519 |
+
# if not all(kw in string.lower() for kw in kws):
|
520 |
+
# continue
|
521 |
+
# elif request.mode == "or":
|
522 |
+
# if not any(kw in string.lower() for kw in kws):
|
523 |
+
# continue
|
524 |
+
# release = request.release
|
525 |
+
# working_group = request.wg
|
526 |
+
# spec_type = request.spec_type
|
527 |
+
|
528 |
+
# if spec.get('version', None) is None or (release is not None and spec["version"].split(".")[0] != str(release)):
|
529 |
+
# continue
|
530 |
+
# if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group):
|
531 |
+
# continue
|
532 |
+
# if spec_type is not None and spec["type"] != spec_type:
|
533 |
+
# continue
|
534 |
+
|
535 |
+
# results.append(spec)
|
536 |
+
# if len(results) > 0:
|
537 |
+
# return KeywordResponse(
|
538 |
+
# results=results,
|
539 |
+
# search_time=time.time() - start_time
|
540 |
+
# )
|
541 |
+
# else:
|
542 |
+
# raise HTTPException(status_code=404, detail="Specifications not found")
|
543 |
+
|
544 |
# def search_spec(request: KeywordRequest):
|
545 |
# chars = "0123456789abcdefghijklmnopqrstuvwxyz"
|
546 |
# start_time = time.time()
|
|
|
620 |
def find_document(request: DocRequest):
|
621 |
start_time = time.time()
|
622 |
finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
|
|
|
|
|
623 |
result = finder.search_document(request.doc_id, request.release)
|
624 |
|
625 |
if "not found" not in result and "Could not" not in result and "Unable" not in result:
|