Major improvement: semantic and token cleaning
app.py (CHANGED)
@@ -1,13 +1,14 @@
 import os
+import re
 import requests
 import pdfplumber
 from flask import Flask, render_template, request, redirect, url_for, flash, send_file
 from werkzeug.utils import secure_filename
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer
-from fpdf import FPDF
+from fpdf import FPDF
 from collections import Counter
-from io import BytesIO
+from io import BytesIO
 
 tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
@@ -21,6 +22,20 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
 last_results = []
 last_common_keywords = []
 
+# Advanced cleaning function
+def clean_text(text):
+    boilerplate_phrases = [
+        "in recent years", "this study", "data suggest that", "in conclusion",
+        "introduction", "methods", "results", "discussion", "this review", "we aimed to",
+        "the aim of this study", "background", "objective", "methodology", "results and discussion"
+    ]
+    text = text.lower()
+    for phrase in boilerplate_phrases:
+        text = text.replace(phrase, "")
+    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
+    return text.strip()
+
+# PDF text extraction function
 def extract_pdf_text(pdf_path):
     text = ""
     try:
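To make the effect of the new cleaning step concrete, here is a small self-contained sketch that mirrors the clean_text() added above (same phrase list, same whitespace collapsing); the sample abstract is invented for illustration.

```python
import re

# Mirrors the clean_text() added above: lowercase, strip boilerplate phrases,
# then collapse the whitespace left behind.
BOILERPLATE_PHRASES = [
    "in recent years", "this study", "data suggest that", "in conclusion",
    "introduction", "methods", "results", "discussion", "this review", "we aimed to",
    "the aim of this study", "background", "objective", "methodology", "results and discussion"
]

def clean_text(text):
    text = text.lower()
    for phrase in BOILERPLATE_PHRASES:
        text = text.replace(phrase, "")
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

sample = "INTRODUCTION: In recent years, deep learning has improved PDF parsing. This study evaluates it."
print(clean_text(sample))
# -> ": , deep learning has improved pdf parsing. evaluates it."
```

Plain substring replacement leaves stray punctuation behind, which is harmless here because preprocess_text() later keeps only alphabetic tokens longer than three characters.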
@@ -29,24 +44,34 @@ def extract_pdf_text(pdf_path):
                 text += page.extract_text() or " "
     except Exception as e:
         print(f"Errore estrazione testo: {e}")
-    return text
+    return clean_text(text)
 
+# Token preprocessing
 def preprocess_text(text):
     tokens = tokenizer.tokenize(text.lower())
     tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
+    # Minimal list of common scientific stopwords
+    stopwords = set([
+        "study", "data", "results", "analysis", "introduction", "conclusion",
+        "method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
+    ])
+    tokens = [token for token in tokens if token not in stopwords]
     return tokens
 
+# Improved token overlap calculation
 def calculate_token_overlap(text1, text2):
     tokens1 = set(text1.split())
     tokens2 = set(text2.split())
     overlap = len(tokens1 & tokens2)
     return round((overlap / max(len(tokens1), 1)) * 100, 2)
 
+# Updated OUI formula
 def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
     oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
     result = round(oui * 100, 2)
     return 0.0 if result == -0.0 else result
 
+# Document validation
 def validate_document(pdf_path, comparison_sources, method="local", titles=None):
     pdf_text = extract_pdf_text(pdf_path)
     pdf_tokens = preprocess_text(pdf_text)
@@ -54,7 +79,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     all_keywords = []
 
     for i, doc in enumerate(comparison_sources):
-        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        doc_text = extract_pdf_text(doc) if method == "local" else clean_text(doc)
         doc_tokens = preprocess_text(doc_text)
 
         similarity = util.pytorch_cos_sim(
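As a quick sanity check on the two scores that feed the OUI formula, the snippet below copies calculate_token_overlap() and calculate_oui() verbatim from the hunk above and runs them on made-up values.

```python
def calculate_token_overlap(text1, text2):
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)

def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    result = round(oui * 100, 2)
    return 0.0 if result == -0.0 else result

# Overlap is normalised by the first text's vocabulary, so it is asymmetric.
print(calculate_token_overlap("gene expression in tumor cells", "tumor cells growth"))  # 40.0
print(calculate_token_overlap("tumor cells growth", "gene expression in tumor cells"))  # 66.67

# With an embedding similarity of 82.5 and a token overlap of 40.0:
# 0.7 * (1 - 0.825) + 0.3 * (1 - 0.40) = 0.1225 + 0.18 = 0.3025  ->  OUI = 30.25
print(calculate_oui(82.5, 40.0))  # 30.25
```

Both terms are inverted, so higher similarity and higher token overlap drive the OUI toward zero.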
@@ -81,6 +106,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     last_common_keywords = Counter(all_keywords).most_common(10)
     return results
 
+# Fetch article details from PubMed
 def fetch_pubmed_details(article_id):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -89,23 +115,21 @@ def fetch_pubmed_details(article_id):
         response.raise_for_status()
         import xml.etree.ElementTree as ET
         root = ET.fromstring(response.text)
-
-
-
-
-        abstract = abstract_element.text.strip() if abstract_element is not None and abstract_element.text else "No Abstract"
-
+        title_elem = root.find(".//ArticleTitle")
+        abstract_elem = root.find(".//AbstractText")
+        title = title_elem.text if title_elem is not None else None
+        abstract = abstract_elem.text if abstract_elem is not None else None
         keywords = root.findall(".//Keyword")
         keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
-
-
-
-
-        return title, f"{abstract} {keyword_text}"
+        if title and abstract:
+            return title, f"{abstract} {keyword_text}"
+        else:
+            return None
     except Exception as e:
         print(f"Errore recupero abstract: {e}")
         return None
 
+# Fetch article IDs from PubMed
 def fetch_pubmed(query, year_start, year_end, max_results=10):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
     params = {
@@ -113,7 +137,7 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
         "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
         "retmax": max_results,
         "retmode": "json",
-        "sort": "relevance"
+        "sort": "relevance"  # Very important: sort by relevance
     }
     try:
         response = requests.get(base_url, params=params)
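For context, this is roughly how the two helpers chain together against the NCBI E-utilities endpoints used above: esearch returns a JSON list of PubMed IDs, and efetch returns the XML record that fetch_pubmed_details() parses. The query string and retmax value are placeholders, and error handling is omitted.

```python
import requests
import xml.etree.ElementTree as ET

# Step 1: esearch -> PubMed IDs as JSON, sorted by relevance (placeholder query).
search = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params={
        "db": "pubmed",
        "term": "machine learning AND (2020[PDAT] : 2025[PDAT])",
        "retmax": 3,
        "retmode": "json",
        "sort": "relevance",
    },
)
ids = search.json()["esearchresult"]["idlist"]

# Step 2: efetch -> XML with ArticleTitle / AbstractText / Keyword nodes per ID.
for article_id in ids:
    fetch = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={"db": "pubmed", "id": article_id, "retmode": "xml"},
    )
    root = ET.fromstring(fetch.text)
    title = root.find(".//ArticleTitle")
    abstract = root.find(".//AbstractText")
    print(article_id,
          title.text if title is not None else None,
          (abstract.text or "")[:80] if abstract is not None else None)
```

With the change above, fetch_pubmed_details() now returns None whenever either the title or the abstract is missing, which is what the validate() route below counts as an invalid article.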
@@ -159,12 +183,23 @@ def validate():
         year_end = request.form.get("year_end", "2025")
         num_articles = int(request.form.get("num_articles", "10"))
         pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
+
+        if not pubmed_ids:
+            flash("Nessun articolo trovato su PubMed. Modifica la query o il range di anni.", "error")
+            return redirect(url_for("index"))
+
         pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
+        total_articles = len(pubmed_ids)
+        valid_articles = len([r for r in pubmed_results if r is not None])
         pubmed_results = [r for r in pubmed_results if r is not None]
 
+        print(f"Trovati {total_articles} articoli da PubMed. Validi dopo controllo: {valid_articles} articoli.")
+
         if not pubmed_results:
             flash("Nessun articolo PubMed valido trovato. Modifica la query o il range di anni.", "error")
             return redirect(url_for("index"))
+        elif valid_articles < total_articles:
+            flash(f"⚠️ Trovati solo {valid_articles} articoli validi su {total_articles} richiesti.", "warning")
 
         results = validate_document(pdf_path, [r[1] for r in pubmed_results], method="pubmed", titles=[r[0] for r in pubmed_results])
 
@@ -213,9 +248,9 @@ def download_report():
     pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
 
     output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
-    pdf.output(output_path, 'F')
+    pdf.output(output_path, 'F')
 
-    return send_file(output_path, as_attachment=True)
+    return send_file(output_path, as_attachment=True)
 
 if __name__ == "__main__":
-    app.run(debug=True, host="0.0.0.0", port=7860)
+    app.run(debug=True, host="0.0.0.0", port=7860)
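The new partial-results path flashes with a "warning" category in addition to the existing "error" one; whether users actually see it depends on the templates, which are not part of this diff. A minimal sketch, assuming nothing about the project's templates, of how Flask hands back the two categories (the sample messages are invented variants of the ones above):

```python
from flask import Flask, flash, get_flashed_messages

app = Flask(__name__)
app.secret_key = "dev"  # flash() stores messages in the session

with app.test_request_context("/"):
    flash("Trovati solo 3 articoli validi su 10 richiesti.", "warning")
    flash("Nessun articolo PubMed valido trovato.", "error")
    # with_categories=True yields (category, message) pairs, so a template
    # can style "warning" and "error" banners differently.
    print(get_flashed_messages(with_categories=True))
    # [('warning', 'Trovati solo 3 articoli validi su 10 richiesti.'),
    #  ('error', 'Nessun articolo PubMed valido trovato.')]
```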