om4r932 committed on
Commit
5110615
·
1 Parent(s): 1de4995

Update search specification

Browse files
Files changed (1) hide show
  1. app.py +156 -11
app.py CHANGED
@@ -145,6 +145,62 @@ def get_scope(specification: str, version: str):
145
  traceback.print_exception(e)
146
  return "Not found (error)"
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  class DocRequest(BaseModel):
149
  doc_id: str
150
  release: Optional[int] = None
@@ -167,13 +223,14 @@ class BatchDocResponse(BaseModel):
167
 
168
  class KeywordRequest(BaseModel):
169
  keywords: str
 
170
  release: Optional[str] = None
171
  wg: Optional[str] = None
172
  spec_type: Optional[Literal["TS", "TR"]] = None
173
  mode: Optional[Literal["and", "or"]] = "and"
174
 
175
  class KeywordResponse(BaseModel):
176
- results: List[Dict[str, str]]
177
  search_time: float
178
 
179
  class TsgDocFinder:
@@ -301,7 +358,9 @@ class SpecDocFinder:
301
  def __init__(self):
302
  self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
303
  self.indexer_file = "indexed_specifications.json"
 
304
  self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
 
305
 
306
  def load_indexer(self):
307
  """Load existing index if available"""
@@ -311,6 +370,31 @@ class SpecDocFinder:
311
  return x["specs"], x["scopes"], x["last_indexed_date"]
312
  return {}, {}, None
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  def save_indexer(self):
315
  """Save the updated index"""
316
  self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
@@ -364,16 +448,16 @@ async def main_menu():
364
  @app.post("/search-spec", response_model=KeywordResponse)
365
  def search_spec(request: KeywordRequest):
366
  start_time = time.time()
367
- kws = [_.lower() for _ in request.keywords.split(" ")]
 
 
368
  results = []
369
 
370
  for string, spec in finder_spec.indexer_specs.items():
371
- if request.mode == "and":
372
- if not all(kw in string.lower() for kw in kws):
373
- continue
374
- elif request.mode == "or":
375
- if not any(kw in string.lower() for kw in kws):
376
- continue
377
  release = request.release
378
  working_group = request.wg
379
  spec_type = request.spec_type
@@ -385,7 +469,37 @@ def search_spec(request: KeywordRequest):
385
  if spec_type is not None and spec["type"] != spec_type:
386
  continue
387
 
388
- results.append(spec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  if len(results) > 0:
390
  return KeywordResponse(
391
  results=results,
@@ -394,6 +508,39 @@ def search_spec(request: KeywordRequest):
394
  else:
395
  raise HTTPException(status_code=404, detail="Specifications not found")
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  # def search_spec(request: KeywordRequest):
398
  # chars = "0123456789abcdefghijklmnopqrstuvwxyz"
399
  # start_time = time.time()
@@ -473,8 +620,6 @@ def search_spec(request: KeywordRequest):
473
  def find_document(request: DocRequest):
474
  start_time = time.time()
475
  finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
476
- print(finder)
477
-
478
  result = finder.search_document(request.doc_id, request.release)
479
 
480
  if "not found" not in result and "Could not" not in result and "Unable" not in result:
 
145
  traceback.print_exception(e)
146
  return "Not found (error)"
147
 
148
def get_spec_content(specification: str, version: str):
    """Split a specification's text into a ``{heading: chapter text}`` mapping.

    The lines returned by ``get_text`` are scanned for the first two lines
    containing "Foreword". The lines between them are treated as the table of
    contents and everything from the second one onwards as the document body.
    Every numbered table-of-contents entry (one to five numbering levels, e.g.
    ``4.2.1.3.1``) is then looked up in the body, and the text between two
    consecutive located headings becomes the content of the first one.

    Args:
        specification: Specification identifier, forwarded to ``get_text``.
        version: Version identifier, forwarded to ``get_text``.

    Returns:
        A dict mapping each located heading (tabs replaced by spaces) to its
        text, in table-of-contents order. An empty dict is returned when the
        two "Foreword" markers or the headings cannot be found. If
        ``get_text`` yields a string, that string is returned unchanged.
    """
    text = get_text(specification, version)

    # NOTE(review): search_spec tests ``isinstance(doc, str)`` to detect an
    # unusable document, so a string coming back from get_text is assumed to
    # be an error message and is passed through instead of being scanned
    # character by character -- confirm against get_text.
    if isinstance(text, str):
        return text

    forewords = []
    for position, line in enumerate(text):
        if "Foreword" in line:
            forewords.append(position)
            if len(forewords) == 2:
                break
    if len(forewords) < 2:
        # Without both markers the table of contents cannot be delimited.
        return {}
    toc_start, body_start = forewords

    # One anchored pattern for 1 to 5 numbering levels. The dots are escaped so
    # that e.g. "5.1.123<TAB>" is not mistaken for a four-level heading.
    heading = re.compile(r"^\d+(?:\.\d+){0,4}\t[\ \S]+")
    chapters = [
        "\t".join(line.split("\t")[:2])  # keep "<number>\t<title>", drop the page column
        for line in text[toc_start:body_start]
        if heading.search(line)
    ]

    real_toc_indexes = {}
    for chapter in chapters:
        try:
            # Search the body only, so a table-of-contents line identical to
            # the heading is never picked up as the chapter start.
            real_toc_indexes[chapter] = text.index(chapter, body_start)
        except ValueError:
            # No exact match: fall back to the first body line that starts
            # with the chapter number followed by a tab. Entries that still
            # cannot be located are left out of the result.
            number = chapter.split("\t")[0] + "\t"
            for position in range(body_start, len(text)):
                if text[position].startswith(number):
                    real_toc_indexes[text[position]] = position
                    break

    if not real_toc_indexes:
        return {}

    toc = list(real_toc_indexes)
    index_toc = list(real_toc_indexes.values())
    document = {}
    # Each chapter runs from the line after its heading up to the next heading.
    for current in range(len(toc) - 1):
        chunk = text[index_toc[current] + 1:index_toc[current + 1]]
        document[toc[current].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(chunk))

    # The last chapter extends to the end of the text; here all whitespace,
    # newlines included, is collapsed to single spaces.
    tail = text[index_toc[-1] + 1:]
    document[toc[-1].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(tail))
    return document
200
+
201
def willLower(string: str, lowered: bool):
    """Return ``string`` lower-cased when ``lowered`` is true, else unchanged."""
    if lowered:
        return string.lower()
    return string
203
+
204
  class DocRequest(BaseModel):
205
  doc_id: str
206
  release: Optional[int] = None
 
223
 
224
  class KeywordRequest(BaseModel):
225
  keywords: str
226
+ case_sensitive: Optional[bool] = False
227
  release: Optional[str] = None
228
  wg: Optional[str] = None
229
  spec_type: Optional[Literal["TS", "TR"]] = None
230
  mode: Optional[Literal["and", "or"]] = "and"
231
 
232
  class KeywordResponse(BaseModel):
233
+ results: List[Dict[str, str|dict]]
234
  search_time: float
235
 
236
  class TsgDocFinder:
 
358
  def __init__(self):
359
  self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
360
  self.indexer_file = "indexed_specifications.json"
361
+ self.doc_zip = "indexed_docs_content.zip"
362
  self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
363
+ self.indexer_documents = self.load_documents()
364
 
365
  def load_indexer(self):
366
  """Load existing index if available"""
 
370
  return x["specs"], x["scopes"], x["last_indexed_date"]
371
  return {}, {}, None
372
 
373
def load_documents(self):
    """Load the indexed document contents from the ``self.doc_zip`` archive.

    The archive members are visited in order and the parsed content of the
    first ``.json`` member that decodes successfully is returned. Members that
    are not valid JSON are reported and skipped.

    Returns:
        The decoded JSON content of the first readable ``.json`` member, or an
        empty dict when the archive does not exist or holds no loadable member.
    """
    if os.path.exists(self.doc_zip):
        # Hand ZipFile the path rather than an already opened file object:
        # ZipFile only closes handles it opened itself, so the previous
        # ``ZipFile(open(...))`` form leaked the underlying file.
        with zipfile.ZipFile(self.doc_zip) as zf:
            for file_name in zf.namelist():
                if not file_name.endswith(".json"):
                    continue
                doc_bytes = zf.read(file_name)
                try:
                    doc_data = json.loads(doc_bytes.decode("utf-8"))
                except json.JSONDecodeError as e:
                    print(f"Error while decoding the JSON file {file_name}: {e}")
                    continue
                print("Documents loaded successfully !")
                return doc_data
    print("Failed !")
    return {}
387
+
388
def get_document(self, spec, version):
    """Return the content of ``spec``, preferring the pre-loaded documents.

    When ``self.indexer_documents`` has no (or an empty) entry for ``spec``,
    the content is built by ``get_spec_content(spec, version)`` instead.
    """
    return self.indexer_documents.get(spec) or get_spec_content(spec, version)
394
+
395
def get_section(self, doc, chapter):
    """Return the entry stored under ``chapter`` in ``doc``.

    A missing ``chapter`` raises whatever ``doc[chapter]`` raises (``KeyError``
    for a dict).
    """
    section = doc[chapter]
    return section
397
+
398
  def save_indexer(self):
399
  """Save the updated index"""
400
  self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
 
448
  @app.post("/search-spec", response_model=KeywordResponse)
449
  def search_spec(request: KeywordRequest):
450
  start_time = time.time()
451
+ booleanLowered = request.case_sensitive
452
+ kws = [willLower(_, booleanLowered) for _ in request.keywords.split(" ")]
453
+ unique_specs = set()
454
  results = []
455
 
456
  for string, spec in finder_spec.indexer_specs.items():
457
+ put = False
458
+ if spec['id'] in unique_specs:
459
+ continue
460
+
 
 
461
  release = request.release
462
  working_group = request.wg
463
  spec_type = request.spec_type
 
469
  if spec_type is not None and spec["type"] != spec_type:
470
  continue
471
 
472
+ contents = []
473
+ version = finder_spec.search_document(spec['id'], spec['release']).split("/")[-1].replace(".zip", "").split("-")[-1]
474
+ doc = finder_spec.get_document(spec['id'], version)
475
+ docValid = not isinstance(doc, str)
476
+
477
+ if request.mode == "and":
478
+ if all(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
479
+ put = True
480
+ if docValid:
481
+ for chapter in list(doc.keys())[1:]:
482
+ if "references" not in chapter.lower():
483
+ if all(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
484
+ put = True
485
+ contents.append(chapter)
486
+ elif request.mode == "or":
487
+ if any(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
488
+ put = True
489
+ if docValid:
490
+ for chapter in list(doc.keys())[1:]:
491
+ if "references" not in chapter.lower():
492
+ if any(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
493
+ put = True
494
+ contents.append(chapter)
495
+
496
+ if put:
497
+ spec_content = spec
498
+ spec_content["contains"] = {chap: doc[chap] for chap in contents}
499
+
500
+ results.append(spec_content)
501
+ else:
502
+ unique_specs.add(spec['id'])
503
  if len(results) > 0:
504
  return KeywordResponse(
505
  results=results,
 
508
  else:
509
  raise HTTPException(status_code=404, detail="Specifications not found")
510
 
511
+ # @app.post("/search-spec", response_model=KeywordResponse)
512
+ # def search_spec(request: KeywordRequest):
513
+ # start_time = time.time()
514
+ # kws = [_.lower() for _ in request.keywords.split(" ")]
515
+ # results = []
516
+
517
+ # for string, spec in finder_spec.indexer_specs.items():
518
+ # if request.mode == "and":
519
+ # if not all(kw in string.lower() for kw in kws):
520
+ # continue
521
+ # elif request.mode == "or":
522
+ # if not any(kw in string.lower() for kw in kws):
523
+ # continue
524
+ # release = request.release
525
+ # working_group = request.wg
526
+ # spec_type = request.spec_type
527
+
528
+ # if spec.get('version', None) is None or (release is not None and spec["version"].split(".")[0] != str(release)):
529
+ # continue
530
+ # if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group):
531
+ # continue
532
+ # if spec_type is not None and spec["type"] != spec_type:
533
+ # continue
534
+
535
+ # results.append(spec)
536
+ # if len(results) > 0:
537
+ # return KeywordResponse(
538
+ # results=results,
539
+ # search_time=time.time() - start_time
540
+ # )
541
+ # else:
542
+ # raise HTTPException(status_code=404, detail="Specifications not found")
543
+
544
  # def search_spec(request: KeywordRequest):
545
  # chars = "0123456789abcdefghijklmnopqrstuvwxyz"
546
  # start_time = time.time()
 
620
  def find_document(request: DocRequest):
621
  start_time = time.time()
622
  finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
 
 
623
  result = finder.search_document(request.doc_id, request.release)
624
 
625
  if "not found" not in result and "Could not" not in result and "Unable" not in result: