om4r932 committed on
Commit
7adc29d
·
1 Parent(s): 9c5671b

Change SDK to Docker + New API

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image for the FastAPI service.
FROM python:3.9

# app.py shells out to `libreoffice --headless --convert-to txt`, so the
# binary must exist in the image. Recommended packages are skipped to keep
# the layer smaller; the apt cache is removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libreoffice && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user with UID 1000.
RUN useradd -m -u 1000 user
USER user
# Make scripts installed by `pip install --user` (e.g. uvicorn) resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy requirements first so the dependency layer is cached across code changes.
COPY --chown=user ./requirements.txt requirements.txt
# NOTE(review): --trusted-host disables TLS verification for PyPI; confirm this
# is still required by the build environment.
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Serve the FastAPI application object `app` from app.py on port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: 3GPPSpecViewer
3
  emoji: 👁
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: static
7
- pinned: true
8
  license: gpl-3.0
9
- short_description: A static website that displays 3GPP specifications via JSON
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: 3GPPSpecSplitter
3
  emoji: 👁
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
  license: gpl-3.0
9
+ short_description: API for splitting 3GPP specifications by their chapters
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
+ import numpy as np
3
+ import pandas as pd
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import json
7
+ import os
8
+ import traceback
9
+ import uuid
10
+ import zipfile
11
+ import io
12
+ import subprocess
13
+ import os
14
+ import re
15
+ import time
16
+ from datetime import datetime
17
+ from dotenv import load_dotenv
18
+ import warnings
19
+ from fastapi import FastAPI, HTTPException
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import FileResponse
22
+ from fastapi.staticfiles import StaticFiles
23
+ from pydantic import BaseModel
24
+ from typing import Any, Dict, List, Literal, Optional
25
+
26
# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Silence every warning process-wide.
# NOTE(review): presumably added to hide the InsecureRequestWarning triggered
# by `verify=False` in get_text; consider narrowing the filter.
warnings.filterwarnings("ignore")

app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
)

# Front-end assets (script.js, style.css) are served from ./static.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Any origin may call the API.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentials must not be combined with wildcard origins/methods/headers
    # (see the FastAPI CORS documentation); nothing in this file relies on
    # cookies, so credentialed cross-origin requests are disabled.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
+
47
# File-name suffixes (compared case-insensitively) treated as Word documents.
_DOC_SUFFIXES = (".doc", ".docx")


def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> List[str]:
    """Convert Word document bytes to text with LibreOffice and return its lines.

    The document is written into a private temporary directory that is removed
    even when the conversion fails, so no files are left behind.

    Raises:
        subprocess.CalledProcessError: if LibreOffice exits with a non-zero status.
    """
    import tempfile

    temp_id = str(uuid.uuid4())
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, f"{temp_id}.{ext}")
        # LibreOffice names its output after the input file's stem.
        output_path = os.path.join(work_dir, f"{temp_id}.txt")

        print("Ecriture")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)

        print("Convertissement")
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", work_dir,
            input_path,
        ], check=True)

        print("Ecriture TXT")
        with open(output_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]


def _extract_spec_lines(zf: zipfile.ZipFile) -> Optional[List[str]]:
    """Return the text lines of the first non-cover Word document in *zf*.

    Nested ZIP members are opened as separate archives and searched the same
    way. Returns None when no suitable document is found.
    """
    for file_name in zf.namelist():
        lowered = file_name.lower()
        if lowered.endswith(".zip"):
            print("Another ZIP !")
            # A separate context manager keeps the outer archive usable and
            # guarantees the nested one is closed.
            with zipfile.ZipFile(io.BytesIO(zf.read(file_name))) as nested:
                lines = _extract_spec_lines(nested)
            if lines is not None:
                return lines
        elif lowered.endswith(_DOC_SUFFIXES):
            if "cover" in lowered:
                print("COVER !")
                continue
            ext = lowered.rsplit(".", 1)[-1]
            return _convert_doc_to_lines(zf.read(file_name), ext)
    return None


def get_text(specification: str, version: str):
    """Download a specification archive from 3gpp.org and return its text lines.

    Args:
        specification: Dotted specification identifier. The part before the
            first dot selects the ``<series>_series`` directory and the
            identifier without dots forms the archive file name.
        version: Version suffix used in the archive file name.

    Returns:
        The stripped, non-empty lines of the first Word document in the
        archive whose name does not contain "cover".

    Raises:
        Exception: if the download does not return HTTP 200 or the archive
            contains no usable .doc/.docx file.
        subprocess.CalledProcessError: if the LibreOffice conversion fails.
    """
    doc_id = specification
    series = doc_id.split(".")[0]

    response = requests.get(
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
        # NOTE(review): TLS verification is disabled here; confirm it is still needed.
        verify=False,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
        # Without a timeout an unresponsive server would block the worker forever.
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        txt_data = _extract_spec_lines(zf)

    if txt_data is None:
        raise Exception(f"Aucun fichier .doc/.docx trouvé dans le ZIP pour {specification}-{version}")
    return txt_data
129
+
130
def get_latest_version(spec: str) -> str:
    """Ask the document-finder service for the latest version of *spec*.

    Args:
        spec: Specification identifier, sent to the service as ``doc_id``.

    Returns:
        The value of the ``version`` field in the service's JSON response.

    Raises:
        HTTPException: 500 if the request itself fails, the upstream status
            code if it is not 200, or 502 if the response body is not JSON
            containing a ``version`` field.
    """
    try:
        # NOTE(review): this host name has no top-level domain and is unlikely
        # to resolve; confirm the intended service URL.
        response = requests.post(
            "https://organizedprogrammers-3gppdocfinder/find",
            headers={"Accept": "application/json"},
            # `json=` serialises the payload itself; passing json.dumps(...)
            # would send a JSON *string* instead of an object.
            json={"doc_id": spec},
            timeout=30,
        )
    except Exception as e:
        raise HTTPException(500, f"An error has occured while getting latest version: {e}") from e

    if response.status_code != 200:
        raise HTTPException(response.status_code, "An error has occured while getting latest version")

    try:
        # Response.json is a method: it must be called before indexing.
        return response.json()["version"]
    except (ValueError, KeyError, TypeError) as e:
        raise HTTPException(502, f"An error has occured while getting latest version: {e}") from e
140
+
141
# Request body accepted by the POST /from-search endpoint.
class SpecRequest(BaseModel):
    # Identifier of the specification to fetch.
    specification: str
    # Version to fetch; when omitted the endpoint looks up the latest one.
    version: Optional[str] = None
144
+
145
@app.get("/")
def main_page():
    """Serve the front-end page stored at templates/index.html."""
    index_path = os.path.join("templates", "index.html")
    return FileResponse(index_path)
148
+
149
# A table-of-contents line: a chapter number of one to five dot-separated
# levels (each level may have several digits), a tab, then a capitalised title.
_CHAPTER_LINE = re.compile(r"\d+(?:\.\d+){0,4}\t[A-Z][a-zA-Z0-9\s,;!?'.-]*")


def _split_into_chapters(text: List[str]) -> Dict[str, str]:
    """Split the lines of a specification into a ``{heading: content}`` mapping.

    The table of contents is taken to be the lines between the first two lines
    containing "Foreword". Each numbered entry found there is then located
    verbatim in the body, and the lines up to the next located heading become
    that chapter's content.

    Raises:
        ValueError: if the table of contents or the chapter headings cannot
            be located.
    """
    foreword_positions: List[int] = []
    for position, line in enumerate(text):
        if "Foreword" in line:
            foreword_positions.append(position)
            if len(foreword_positions) == 2:
                break
    if len(foreword_positions) < 2:
        raise ValueError("Could not locate the table of contents in the specification")
    toc_start, toc_end = foreword_positions

    # Keep only "<number>\t<title>", dropping any trailing page-number column.
    headings = [
        "\t".join(line.split("\t")[:2])
        for line in text[toc_start:toc_end]
        if _CHAPTER_LINE.fullmatch(line)
    ]

    # Map each heading to its line index in the body. The search starts after
    # the table of contents so a TOC entry is never mistaken for the heading.
    # Headings that do not appear verbatim are skipped rather than given a
    # non-integer placeholder, which would break slicing.
    heading_positions: Dict[str, int] = {}
    for heading in headings:
        if heading in heading_positions:
            continue
        try:
            heading_positions[heading] = text.index(heading, toc_end)
        except ValueError:
            continue
    if not heading_positions:
        raise ValueError("No chapter heading could be located in the specification")

    located = list(heading_positions.items())
    document: Dict[str, str] = {}
    for rank, (heading, start) in enumerate(located):
        # A chapter runs until the next located heading, or to the end of the
        # text for the last one.
        end = located[rank + 1][1] if rank + 1 < len(located) else len(text)
        content = "\n".join(text[start + 1:end])
        # Every chapter, including the last, is normalised the same way:
        # runs of spaces/tabs collapse to one space, line breaks are kept.
        document[heading.replace("\t", " ")] = re.sub(r"[ \t]+", " ", content)
    return document


@app.post("/from-search")
def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
    """Return a specification split by chapter.

    Args:
        req: Request carrying the specification identifier and an optional
            version; when the version is missing the latest one is looked up.

    Returns:
        A mapping from chapter heading ("<number> <title>") to chapter text.

    Raises:
        HTTPException: 422 if the document structure (table of contents or
            chapter headings) cannot be recognised.
    """
    spec = req.specification
    version = req.version
    if not version:
        version = get_latest_version(spec)

    text = get_text(spec, version)
    try:
        return _split_into_chapters(text)
    except ValueError as err:
        raise HTTPException(status_code=422, detail=str(err)) from err
script.js → static/script.js RENAMED
File without changes
style.css → static/style.css RENAMED
File without changes
index.html → templates/index.html RENAMED
@@ -5,7 +5,7 @@
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
  <title>3GPP Specification Visualizor</title>
8
- <link rel="stylesheet" href="style.css">
9
  </head>
10
 
11
  <body>
@@ -18,7 +18,7 @@
18
  <div id="document-container">
19
  <div class="loading">Veuillez charger un fichier de spécification 3GPP</div>
20
  </div>
21
- <script src="script.js"></script>
22
  </body>
23
 
24
  </html>
 
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
  <title>3GPP Specification Visualizor</title>
8
+ <link rel="stylesheet" href="static/style.css">
9
  </head>
10
 
11
  <body>
 
18
  <div id="document-container">
19
  <div class="loading">Veuillez charger un fichier de spécification 3GPP</div>
20
  </div>
21
+ <script src="static/script.js"></script>
22
  </body>
23
 
24
  </html>