Spaces:

OrganizedProgrammers
/

3GPPDocFinder

Running

App Files Files Community

om4r932 committed on 1 day ago

Commit

5110615

1 Parent(s): 1de4995

Update search specification

Browse files

Files changed (1) hide show

app.py +156 -11

app.py CHANGED Viewed

@@ -145,6 +145,62 @@ def get_scope(specification: str, version: str):
         traceback.print_exception(e)
         return "Not found (error)"
 class DocRequest(BaseModel):
     doc_id: str
     release: Optional[int] = None
@@ -167,13 +223,14 @@ class BatchDocResponse(BaseModel):
 class KeywordRequest(BaseModel):
     keywords: str
     release: Optional[str] = None
     wg: Optional[str] = None
     spec_type: Optional[Literal["TS", "TR"]] = None
     mode: Optional[Literal["and", "or"]] = "and"
 class KeywordResponse(BaseModel):
-    results: List[Dict[str, str]]
     search_time: float
 class TsgDocFinder:
@@ -301,7 +358,9 @@ class SpecDocFinder:
     def __init__(self):
         self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
         self.indexer_file = "indexed_specifications.json"
         self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
     def load_indexer(self):
         """Load existing index if available"""
@@ -311,6 +370,31 @@ class SpecDocFinder:
                 return x["specs"], x["scopes"], x["last_indexed_date"]
         return {}, {}, None
     def save_indexer(self):
         """Save the updated index"""
         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
@@ -364,16 +448,16 @@ async def main_menu():
 @app.post("/search-spec", response_model=KeywordResponse)
 def search_spec(request: KeywordRequest):
     start_time = time.time()
-    kws = [_.lower() for _ in request.keywords.split(" ")]
     results = []
     for string, spec in finder_spec.indexer_specs.items():
-        if request.mode == "and":
-            if not all(kw in string.lower() for kw in kws):
-                continue
-        elif request.mode == "or":
-            if not any(kw in string.lower() for kw in kws):
-                continue
         release = request.release
         working_group = request.wg
         spec_type = request.spec_type
@@ -385,7 +469,37 @@ def search_spec(request: KeywordRequest):
         if spec_type is not None and spec["type"] != spec_type:
             continue
-        results.append(spec)
     if len(results) > 0:
         return KeywordResponse(
             results=results,
@@ -394,6 +508,39 @@ def search_spec(request: KeywordRequest):
     else:
         raise HTTPException(status_code=404, detail="Specifications not found")
 # def search_spec(request: KeywordRequest):
 #     chars = "0123456789abcdefghijklmnopqrstuvwxyz"
 #     start_time = time.time()
@@ -473,8 +620,6 @@ def search_spec(request: KeywordRequest):
 def find_document(request: DocRequest):
     start_time = time.time()
     finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
-    print(finder)
     result = finder.search_document(request.doc_id, request.release)
     if "not found" not in result and "Could not" not in result and "Unable" not in result:

         traceback.print_exception(e)
         return "Not found (error)"
# One TOC entry: a clause number with up to five dotted levels ("4", "4.2",
# ... "4.2.1.3.5"), a tab, then a title made of spaces / non-whitespace chars.
_TOC_ENTRY_RE = re.compile(r"^\d+(?:\.\d+){0,4}\t[\ \S]+")


def _split_spec_into_sections(text):
    """Split the lines of a specification into ``{heading: section text}``.

    The table of contents is taken to be the lines between the first and the
    second line containing "Foreword". Every numbered TOC entry is then looked
    up in ``text`` and the lines between two consecutive located headings
    become the text of the first one.

    Args:
        text: sequence of line strings (assumed to be a list, since
            ``list.index`` is used to locate headings).

    Returns:
        dict mapping each heading (tabs replaced by spaces) to its section
        text, in table-of-contents order. Empty when no table of contents can
        be delimited or none of its entries can be located.
    """
    # Rows of the first two lines mentioning "Foreword": TOC entry and body.
    foreword_rows = []
    for row, line in enumerate(text):
        if "Foreword" in line:
            foreword_rows.append(row)
            if len(foreword_rows) == 2:
                break
    if len(foreword_rows) < 2:
        # No delimitable table of contents: nothing to split on.
        return {}
    toc_start, body_start = foreword_rows

    # Keep only "<number>\t<title>" from each TOC line (drops e.g. a trailing
    # page-number column).
    headings = [
        "\t".join(line.split("\t")[:2])
        for line in text[toc_start:body_start]
        if _TOC_ENTRY_RE.match(line)
    ]

    heading_rows = {}
    for heading in headings:
        try:
            heading_rows[heading] = text.index(heading)
        except ValueError:
            # No line is exactly "<number>\t<title>": fall back to the first
            # line at/after the second "Foreword" containing "<number>\t".
            # NOTE(review): substring containment is kept from the original;
            # "1\t" also matches a line starting with "11\t" - confirm whether
            # a prefix match is wanted.
            number = heading.split("\t")[0] + "\t"
            for row in range(body_start, len(text)):
                if number in text[row]:
                    heading_rows[text[row]] = row
                    break

    if not heading_rows:
        return {}

    entries = list(heading_rows.items())
    document = {}
    for (title, row), (_, next_row) in zip(entries, entries[1:]):
        section = "\n".join(text[row + 1:next_row])
        # Line breaks are kept here; only runs of spaces/tabs are collapsed.
        document[title.replace("\t", " ")] = re.sub(r"[\ \t]+", " ", section)
    # The last located heading owns everything up to the end of the text and
    # is flattened to a single line.
    last_title, last_row = entries[-1]
    document[last_title.replace("\t", " ")] = re.sub(
        r"\s+", " ", " ".join(text[last_row + 1:])
    )
    return document


def get_spec_content(specification: str, version: str):
    """Return the content of a specification split by table-of-contents entry.

    Args:
        specification: specification identifier, passed through to ``get_text``.
        version: version identifier, passed through to ``get_text``.

    Returns:
        dict mapping each located heading to its section text; empty when the
        text has fewer than two "Foreword" lines or no heading can be located.
        (``get_text`` is assumed to return a list of line strings - TODO
        confirm against its definition.)
    """
    return _split_spec_into_sections(get_text(specification, version))
def willLower(string: str, lowered: bool):
    """Return ``string`` lower-cased when ``lowered`` is truthy, else unchanged."""
    if lowered:
        return string.lower()
    return string
 class DocRequest(BaseModel):
     doc_id: str
     release: Optional[int] = None
 class KeywordRequest(BaseModel):
     keywords: str
+    case_sensitive: Optional[bool] = False
     release: Optional[str] = None
     wg: Optional[str] = None
     spec_type: Optional[Literal["TS", "TR"]] = None
     mode: Optional[Literal["and", "or"]] = "and"
 class KeywordResponse(BaseModel):
+    results: List[Dict[str, str|dict]]
     search_time: float
 class TsgDocFinder:
     def __init__(self):
         self.chars = "0123456789abcdefghijklmnopqrstuvwxyz"
         self.indexer_file = "indexed_specifications.json"
+        self.doc_zip = "indexed_docs_content.zip"
         self.indexer_specs, self.indexer_scopes, self.last_indexer_date = self.load_indexer()
+        self.indexer_documents = self.load_documents()
     def load_indexer(self):
         """Load existing index if available"""
                 return x["specs"], x["scopes"], x["last_indexed_date"]
         return {}, {}, None
+    def load_documents(self):
+        if os.path.exists(self.doc_zip):
+            with zipfile.ZipFile(open(self.doc_zip, "rb")) as zf:
+                for file_name in zf.namelist():
+                    if file_name.endswith(".json"):
+                        doc_bytes = zf.read(file_name)
+                        try:
+                            doc_data = json.loads(doc_bytes.decode("utf-8"))
+                            print("Documents loaded successfully !")
+                            return doc_data
+                        except json.JSONDecodeError as e:
+                            print(f"Error while decoding the JSON file {file_name}: {e}")
+        print("Failed !")
+        return {}
+    def get_document(self, spec, version):
+        doc = self.indexer_documents.get(spec)
+        if doc:
+            return doc
+        else:
+            return get_spec_content(spec, version)
+    def get_section(self, doc, chapter):
+        return doc[chapter]
     def save_indexer(self):
         """Save the updated index"""
         self.last_indexer_date = today.strftime("%d/%m/%Y-%H:%M:%S")
 @app.post("/search-spec", response_model=KeywordResponse)
 def search_spec(request: KeywordRequest):
     start_time = time.time()
+    booleanLowered = request.case_sensitive
+    kws = [willLower(_, booleanLowered) for _ in request.keywords.split(" ")]
+    unique_specs = set()
     results = []
     for string, spec in finder_spec.indexer_specs.items():
+        put = False
+        if spec['id'] in unique_specs:
+            continue
         release = request.release
         working_group = request.wg
         spec_type = request.spec_type
         if spec_type is not None and spec["type"] != spec_type:
             continue
+        contents = []
+        version = finder_spec.search_document(spec['id'], spec['release']).split("/")[-1].replace(".zip", "").split("-")[-1]
+        doc = finder_spec.get_document(spec['id'], version)
+        docValid = not isinstance(doc, str)
+        if request.mode == "and":
+            if all(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
+                put = True
+            if docValid:
+                for chapter in list(doc.keys())[1:]:
+                    if "references" not in chapter.lower():
+                        if all(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
+                            put = True
+                            contents.append(chapter)
+        elif request.mode == "or":
+            if any(kw in willLower(string, booleanLowered).split("+-+") for kw in kws):
+                put = True
+            if docValid:
+                for chapter in list(doc.keys())[1:]:
+                    if "references" not in chapter.lower():
+                        if any(kw in willLower(doc[chapter], booleanLowered).split(" ") for kw in kws):
+                            put = True
+                            contents.append(chapter)
+        if put:
+            spec_content = spec
+            spec_content["contains"] = {chap: doc[chap] for chap in contents}
+            results.append(spec_content)
+        else:
+            unique_specs.add(spec['id'])
     if len(results) > 0:
         return KeywordResponse(
             results=results,
     else:
         raise HTTPException(status_code=404, detail="Specifications not found")
+# @app.post("/search-spec", response_model=KeywordResponse)
+# def search_spec(request: KeywordRequest):
+#     start_time = time.time()
+#     kws = [_.lower() for _ in request.keywords.split(" ")]
+#     results = []
+#     for string, spec in finder_spec.indexer_specs.items():
+#         if request.mode == "and":
+#             if not all(kw in string.lower() for kw in kws):
+#                 continue
+#         elif request.mode == "or":
+#             if not any(kw in string.lower() for kw in kws):
+#                 continue
+#         release = request.release
+#         working_group = request.wg
+#         spec_type = request.spec_type
+#         if spec.get('version', None) is None or (release is not None and spec["version"].split(".")[0] != str(release)):
+#             continue
+#         if spec.get('working_group', None) is None or (working_group is not None and spec["working_group"] != working_group):
+#             continue
+#         if spec_type is not None and spec["type"] != spec_type:
+#             continue
+#         results.append(spec)
+#     if len(results) > 0:
+#         return KeywordResponse(
+#             results=results,
+#             search_time=time.time() - start_time
+#         )
+#     else:
+#         raise HTTPException(status_code=404, detail="Specifications not found")
 # def search_spec(request: KeywordRequest):
 #     chars = "0123456789abcdefghijklmnopqrstuvwxyz"
 #     start_time = time.time()
 def find_document(request: DocRequest):
     start_time = time.time()
     finder = finder_tsg if request.doc_id[0].isalpha() else finder_spec
     result = finder.search_document(request.doc_id, request.release)
     if "not found" not in result and "Could not" not in result and "Unable" not in result: