Change SDK to Docker + New API
Browse files- Dockerfile +16 -0
- README.md +6 -6
- app.py +198 -0
- script.js → static/script.js +0 -0
- style.css → static/style.css +0 -0
- index.html → templates/index.html +2 -2
Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9

# app.py converts .doc/.docx files by shelling out to
# `libreoffice --headless --convert-to txt`, so the binary has to be present
# in the image. The original step ran `apt-get update` without installing
# anything, which left the conversion command missing at runtime.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libreoffice-writer && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user (uid 1000) instead of root.
RUN useradd -m -u 1000 user
USER user
# pip installs run as this user, so console scripts (e.g. uvicorn) land in
# ~/.local/bin; put that directory on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy the requirements first so the dependency layer is cached independently
# of application source changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 👁
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
-
pinned:
|
8 |
license: gpl-3.0
|
9 |
-
short_description:
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: 3GPPSpecSplitter
|
3 |
emoji: 👁
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
license: gpl-3.0
|
9 |
+
short_description: API for splitting 3GPP specifications by their chapters
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import StringIO
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import requests
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import traceback
|
9 |
+
import uuid
|
10 |
+
import zipfile
|
11 |
+
import io
|
12 |
+
import subprocess
|
13 |
+
import os
|
14 |
+
import re
|
15 |
+
import time
|
16 |
+
from datetime import datetime
|
17 |
+
from dotenv import load_dotenv
|
18 |
+
import warnings
|
19 |
+
from fastapi import FastAPI, HTTPException
|
20 |
+
from fastapi.middleware.cors import CORSMiddleware
|
21 |
+
from fastapi.responses import FileResponse
|
22 |
+
from fastapi.staticfiles import StaticFiles
|
23 |
+
from pydantic import BaseModel
|
24 |
+
from typing import Any, Dict, List, Literal, Optional
|
25 |
+
|
26 |
+
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")

app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
)

# Serve the front-end assets (script.js, style.css) from ./static under /static.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Any origin may call the API.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation -- confirm credentials are
# really needed, otherwise consider allow_credentials=False.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
46 |
+
|
47 |
+
def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> List[str]:
    """Convert a Word document to text with LibreOffice and return its non-empty lines."""
    import tempfile

    # A private temporary directory gives a collision-free workspace and is
    # removed even when the conversion or the read-back fails.
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, f"source.{ext}")
        # LibreOffice names the output after the input file's base name.
        output_path = os.path.join(work_dir, "source.txt")

        print("Ecriture")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)

        print("Convertissement")
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", work_dir,
            input_path
        ], check=True)

        print("Ecriture TXT")
        # Always decode as UTF-8 rather than the platform default encoding.
        with open(output_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]


def _first_spec_lines(archive: zipfile.ZipFile) -> Optional[List[str]]:
    """Return the text lines of the first non-cover .doc/.docx in *archive*, or None.

    Nested ZIP members are opened and searched in place, in member order.
    """
    for member in archive.namelist():
        lowered = member.lower()
        if lowered.endswith(".zip"):
            print("Another ZIP !")
            # Open the nested archive under its own name so the outer archive
            # stays usable for the remaining members, and close it afterwards.
            with zipfile.ZipFile(io.BytesIO(archive.read(member))) as nested:
                lines = _first_spec_lines(nested)
            if lines is not None:
                return lines
        elif lowered.endswith((".doc", ".docx")):
            if "cover" in lowered:
                print("COVER !")
                continue
            ext = member.rsplit(".", 1)[-1]
            return _convert_doc_to_lines(archive.read(member), ext)
    return None


def get_text(specification: str, version: str) -> List[str]:
    """Download a 3GPP specification archive and return its text as stripped lines.

    The ZIP for ``specification``/``version`` is fetched from the 3GPP archive,
    the first .doc/.docx member whose name does not contain "cover" is
    converted to plain text with LibreOffice, and the non-empty, stripped
    lines of that text are returned.

    Args:
        specification: Specification number such as "23.501"; the part before
            the first dot selects the series directory.
        version: Version string used as the suffix of the archive file name.

    Returns:
        The non-empty lines of the converted document, whitespace-stripped.

    Raises:
        Exception: If the download does not return HTTP 200 or the archive
            contains no usable .doc/.docx file.
        subprocess.CalledProcessError: If the LibreOffice conversion fails.
    """
    doc_id = specification
    series = doc_id.split(".")[0]

    # NOTE(review): verify=False disables TLS certificate verification for
    # this download; kept as in the original, but confirm it is still needed.
    response = requests.get(
        f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
        verify=False,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
        # Do not let a stalled connection block the worker forever.
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")

    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        lines = _first_spec_lines(zf)

    if lines is None:
        raise Exception(f"Aucun fichier .doc/.docx trouvé dans le ZIP pour {specification}-{version}")
    return lines
|
129 |
+
|
130 |
+
def get_latest_version(spec: str) -> str:
    """Ask the document-finder service for the latest version of a specification.

    Args:
        spec: Specification identifier, sent to the service as ``doc_id``.

    Returns:
        The value of the ``version`` field of the service's JSON response.

    Raises:
        HTTPException: With status 500 if the request fails or the response
            has no usable ``version`` field, or with the upstream status code
            if the service does not answer 200.
    """
    try:
        # NOTE(review): this host has no top-level domain and looks
        # incomplete -- confirm the real URL of the finder service.
        req = requests.post(
            "https://organizedprogrammers-3gppdocfinder/find",
            headers={"Accept": "application/json"},
            # Pass the dict itself: `json=` already serialises it, so wrapping
            # it in json.dumps() would send a JSON *string* instead of an object.
            json={"doc_id": spec},
            timeout=30,
        )
    except Exception as e:
        raise HTTPException(500, f"An error has occured while getting latest version: {e}") from e

    if req.status_code != 200:
        raise HTTPException(req.status_code, "An error has occured while getting latest version")

    try:
        # .json is a method; it must be called to obtain the decoded body.
        return req.json()["version"]
    except (ValueError, KeyError, TypeError) as e:
        raise HTTPException(500, f"An error has occured while getting latest version: {e}") from e
|
140 |
+
|
141 |
+
class SpecRequest(BaseModel):
    """Request body of the /from-search endpoint."""

    # Specification identifier to split.
    specification: str
    # Version to fetch; when omitted, the endpoint looks up the latest version.
    version: Optional[str] = None
|
144 |
+
|
145 |
+
@app.get("/")
def main_page():
    """Serve the single-page front end stored in templates/index.html."""
    index_path = os.path.join("templates", "index.html")
    return FileResponse(index_path)
|
148 |
+
|
149 |
+
# A table-of-contents line: a clause number with one to five dot-separated
# numeric levels, a tab, then a title starting with a capital letter. The
# title class includes whitespace, so a trailing "\t<page>" is accepted too.
_CHAPTER_HEADING_RE = re.compile(r"\d+(?:\.\d+){0,4}\t[A-Z][a-zA-Z0-9\s,;!?'.-]*")


def _split_into_chapters(text: List[str]) -> Dict[str, str]:
    """Split the lines of a specification into a ``{heading: content}`` mapping.

    The table of contents is taken to be the lines between the first two lines
    containing "Foreword". Every numbered heading found there is then located
    in the document body, and the lines between consecutive headings become
    that heading's content.

    Raises:
        ValueError: If the table of contents cannot be delimited or none of
            its headings is found in the body.
    """
    foreword_positions = []
    for position, line in enumerate(text):
        if "Foreword" in line:
            foreword_positions.append(position)
            if len(foreword_positions) == 2:
                break
    if len(foreword_positions) < 2:
        raise ValueError("Could not locate the table of contents (expected two 'Foreword' lines)")
    toc_start, body_start = foreword_positions

    # Keep "<number>\t<title>" and drop any trailing page-number column.
    headings = [
        "\t".join(line.split("\t")[:2])
        for line in text[toc_start:body_start]
        if _CHAPTER_HEADING_RE.fullmatch(line)
    ]

    heading_positions = {}
    for heading in headings:
        try:
            # Search after the table of contents so a TOC entry is never
            # mistaken for the heading inside the body.
            heading_positions[heading] = text.index(heading, body_start)
        except ValueError:
            # Heading listed in the TOC but absent from the body: skip it
            # rather than record a position that cannot be used for slicing.
            continue

    if not heading_positions:
        raise ValueError("No chapter heading from the table of contents was found in the document body")

    titles = list(heading_positions)
    positions = list(heading_positions.values())

    document = {}
    for title, start, end in zip(titles, positions, positions[1:]):
        document[title.replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[start + 1:end]))

    # The last chapter runs to the end of the document. Unlike the other
    # chapters, its whitespace (newlines included) is collapsed to single
    # spaces, exactly as the original implementation did.
    document[titles[-1].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[positions[-1] + 1:]))
    return document


@app.post("/from-search")
def get_file_from_spec_id_version(req: SpecRequest) -> Dict[str, str]:
    """Return the requested specification split by chapter.

    When the request carries no version, the latest one is looked up first.

    Args:
        req: Specification identifier and optional version.

    Returns:
        A mapping from "<number> <title>" to the text of that chapter.

    Raises:
        HTTPException: With status 500 if the document cannot be split into
            chapters.
    """
    spec = req.specification
    version = req.version
    if not version:
        version = get_latest_version(spec)

    text = get_text(spec, version)
    try:
        return _split_into_chapters(text)
    except ValueError as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
script.js → static/script.js
RENAMED
File without changes
|
style.css → static/style.css
RENAMED
File without changes
|
index.html → templates/index.html
RENAMED
@@ -5,7 +5,7 @@
|
|
5 |
<meta charset="UTF-8">
|
6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
7 |
<title>3GPP Specification Visualizor</title>
|
8 |
-
<link rel="stylesheet" href="style.css">
|
9 |
</head>
|
10 |
|
11 |
<body>
|
@@ -18,7 +18,7 @@
|
|
18 |
<div id="document-container">
|
19 |
<div class="loading">Veuillez charger un fichier de spécification 3GPP</div>
|
20 |
</div>
|
21 |
-
<script src="script.js"></script>
|
22 |
</body>
|
23 |
|
24 |
</html>
|
|
|
5 |
<meta charset="UTF-8">
|
6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
7 |
<title>3GPP Specification Visualizor</title>
|
8 |
+
<link rel="stylesheet" href="static/style.css">
|
9 |
</head>
|
10 |
|
11 |
<body>
|
|
|
18 |
<div id="document-container">
|
19 |
<div class="loading">Veuillez charger un fichier de spécification 3GPP</div>
|
20 |
</div>
|
21 |
+
<script src="static/script.js"></script>
|
22 |
</body>
|
23 |
|
24 |
</html>
|