Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection
Browse files
app.py
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
import os
|
2 |
-
import re
|
3 |
import requests
|
4 |
import pdfplumber
|
5 |
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
|
6 |
from werkzeug.utils import secure_filename
|
7 |
from sentence_transformers import SentenceTransformer, util
|
8 |
from transformers import AutoTokenizer
|
9 |
-
from fpdf import FPDF
|
10 |
from collections import Counter
|
11 |
-
from io import BytesIO
|
12 |
|
13 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
14 |
|
@@ -22,20 +21,6 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
|
|
22 |
last_results = []
|
23 |
last_common_keywords = []
|
24 |
|
25 |
-
# Funzione di cleaning avanzato
|
26 |
-
def clean_text(text):
|
27 |
-
boilerplate_phrases = [
|
28 |
-
"in recent years", "this study", "data suggest that", "in conclusion",
|
29 |
-
"introduction", "methods", "results", "discussion", "this review", "we aimed to",
|
30 |
-
"the aim of this study", "background", "objective", "methodology", "results and discussion"
|
31 |
-
]
|
32 |
-
text = text.lower()
|
33 |
-
for phrase in boilerplate_phrases:
|
34 |
-
text = text.replace(phrase, "")
|
35 |
-
text = re.sub(r'\s+', ' ', text)
|
36 |
-
return text.strip()
|
37 |
-
|
38 |
-
# Estrazione testo PDF
|
39 |
def extract_pdf_text(pdf_path):
|
40 |
text = ""
|
41 |
try:
|
@@ -44,96 +29,51 @@ def extract_pdf_text(pdf_path):
|
|
44 |
text += page.extract_text() or " "
|
45 |
except Exception as e:
|
46 |
print(f"Errore estrazione testo: {e}")
|
47 |
-
return
|
48 |
-
|
49 |
-
# Funzione per estrarre la sezione "Materiali e Metodi"
|
50 |
-
def extract_materials_and_methods(pdf_path):
|
51 |
-
text = extract_pdf_text(pdf_path)
|
52 |
-
# Supponiamo che la sezione 'Materiali e Metodi' sia identificabile da uno degli headers comuni
|
53 |
-
start = text.lower().find("materials and methods")
|
54 |
-
if start == -1:
|
55 |
-
return text # Restituisce tutto il testo se non trova la sezione
|
56 |
-
end = text.lower().find("results", start)
|
57 |
-
if end == -1:
|
58 |
-
end = len(text) # Fino alla fine del documento se non trova la fine della sezione
|
59 |
-
return text[start:end]
|
60 |
-
|
61 |
-
# Preprocessing testo
|
62 |
def preprocess_text(text):
|
63 |
tokens = tokenizer.tokenize(text.lower())
|
64 |
tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
|
65 |
-
stopwords = set([
|
66 |
-
"study", "data", "results", "analysis", "introduction", "conclusion",
|
67 |
-
"method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
|
68 |
-
])
|
69 |
-
tokens = [token for token in tokens if token not in stopwords]
|
70 |
return tokens
|
71 |
|
72 |
-
# Calcolo token overlap
|
73 |
def calculate_token_overlap(text1, text2):
|
74 |
tokens1 = set(text1.split())
|
75 |
tokens2 = set(text2.split())
|
76 |
overlap = len(tokens1 & tokens2)
|
77 |
return round((overlap / max(len(tokens1), 1)) * 100, 2)
|
78 |
|
79 |
-
# Formula OUI
|
80 |
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
|
81 |
oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
|
82 |
result = round(oui * 100, 2)
|
83 |
-
if result
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
# Funzione di scoring degli articoli
|
88 |
-
def score_article(article, pdf_text):
|
89 |
-
# Calcola la similarità semantica
|
90 |
-
similarity = util.pytorch_cos_sim(
|
91 |
-
model.encode(pdf_text, convert_to_tensor=True),
|
92 |
-
model.encode(article['text'], convert_to_tensor=True)
|
93 |
-
).item() * 100
|
94 |
-
|
95 |
-
# Calcola il numero di keyword comuni
|
96 |
-
tokens_pdf = preprocess_text(pdf_text)
|
97 |
-
tokens_article = preprocess_text(article['text'])
|
98 |
-
common_tokens = len(set(tokens_pdf) & set(tokens_article))
|
99 |
-
|
100 |
-
# Punteggio complessivo (modificabile in base ai tuoi parametri)
|
101 |
-
score = 0.7 * similarity + 0.3 * common_tokens
|
102 |
-
return score
|
103 |
-
|
104 |
-
# Validazione documento
|
105 |
-
def validate_document(pdf_path, comparison_sources, method="local", titles=None, num_articles=10):
|
106 |
-
pdf_text = extract_materials_and_methods(pdf_path)
|
107 |
pdf_tokens = preprocess_text(pdf_text)
|
108 |
results = []
|
109 |
all_keywords = []
|
110 |
|
111 |
-
# Calcolare il punteggio di ogni articolo
|
112 |
-
scored_articles = []
|
113 |
for i, doc in enumerate(comparison_sources):
|
114 |
-
doc_text =
|
115 |
doc_tokens = preprocess_text(doc_text)
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
# Seleziona i migliori articoli in base al punteggio
|
130 |
-
for i in range(min(num_articles, len(scored_articles))):
|
131 |
-
article = scored_articles[i][1]
|
132 |
results.append({
|
133 |
-
"title":
|
134 |
-
"similarity": round(
|
135 |
-
"token_overlap": round(
|
136 |
-
"oui":
|
137 |
})
|
138 |
|
139 |
global last_results, last_common_keywords
|
@@ -141,7 +81,6 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None,
|
|
141 |
last_common_keywords = Counter(all_keywords).most_common(10)
|
142 |
return results
|
143 |
|
144 |
-
# Fetch dettagli articoli da PubMed
|
145 |
def fetch_pubmed_details(article_id):
|
146 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
147 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
@@ -150,62 +89,32 @@ def fetch_pubmed_details(article_id):
|
|
150 |
response.raise_for_status()
|
151 |
import xml.etree.ElementTree as ET
|
152 |
root = ET.fromstring(response.text)
|
153 |
-
|
154 |
-
|
155 |
-
title = title_elem.text if title_elem is not None else None
|
156 |
-
abstract = abstract_elem.text if abstract_elem is not None else None
|
157 |
keywords = root.findall(".//Keyword")
|
158 |
keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
|
159 |
-
|
160 |
-
return title, f"{abstract} {keyword_text}"
|
161 |
-
else:
|
162 |
-
return None
|
163 |
except Exception as e:
|
164 |
print(f"Errore recupero abstract: {e}")
|
165 |
-
return
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
}
|
185 |
-
try:
|
186 |
-
response = requests.get(base_search_url, params=params)
|
187 |
-
response.raise_for_status()
|
188 |
-
batch_ids = response.json().get("esearchresult", {}).get("idlist", [])
|
189 |
-
if not batch_ids:
|
190 |
-
break
|
191 |
-
|
192 |
-
all_ids.extend(batch_ids)
|
193 |
-
start += batch_size
|
194 |
-
|
195 |
-
for id_ in batch_ids:
|
196 |
-
result = fetch_pubmed_details(id_)
|
197 |
-
if result:
|
198 |
-
fetched_articles.append(result)
|
199 |
-
if len(fetched_articles) == desired_articles:
|
200 |
-
break
|
201 |
-
|
202 |
-
except Exception as e:
|
203 |
-
print(f"Errore fetch batch PubMed: {e}")
|
204 |
-
break
|
205 |
-
|
206 |
-
attempts += 1
|
207 |
-
|
208 |
-
return fetched_articles
|
209 |
|
210 |
@app.route("/")
|
211 |
def index():
|
@@ -242,18 +151,17 @@ def validate():
|
|
242 |
year_start = request.form.get("year_start", "2000")
|
243 |
year_end = request.form.get("year_end", "2025")
|
244 |
num_articles = int(request.form.get("num_articles", "10"))
|
|
|
245 |
|
246 |
-
|
247 |
-
|
248 |
-
if not pubmed_results:
|
249 |
-
flash("❌ Nessun articolo PubMed valido trovato. Modifica la query o il range di anni.", "error")
|
250 |
return redirect(url_for("index"))
|
251 |
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
|
256 |
-
results = validate_document(pdf_path,
|
257 |
|
258 |
return render_template("NORUS.html", results=results, keywords=last_common_keywords)
|
259 |
|
@@ -266,13 +174,13 @@ def download_report():
|
|
266 |
pdf = FPDF()
|
267 |
pdf.add_page()
|
268 |
pdf.set_font("Arial", "B", 16)
|
269 |
-
pdf.cell(0, 10, "NORUS Tool - Report
|
270 |
pdf.ln(10)
|
271 |
pdf.set_font('Arial', '', 12)
|
272 |
-
pdf.multi_cell(0, 10, "OUI
|
273 |
pdf.ln(5)
|
274 |
pdf.set_font("Arial", "B", 12)
|
275 |
-
pdf.cell(90, 10, "
|
276 |
pdf.cell(30, 10, "Sim %", 1)
|
277 |
pdf.cell(30, 10, "Overlap %", 1)
|
278 |
pdf.cell(30, 10, "OUI", 1)
|
@@ -290,7 +198,7 @@ def download_report():
|
|
290 |
if last_common_keywords:
|
291 |
pdf.ln(6)
|
292 |
pdf.set_font("Arial", "B", 12)
|
293 |
-
pdf.cell(0, 10, "
|
294 |
pdf.set_font("Arial", "", 11)
|
295 |
for kw, count in last_common_keywords:
|
296 |
pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
|
@@ -300,7 +208,7 @@ def download_report():
|
|
300 |
pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
|
301 |
|
302 |
output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
|
303 |
-
pdf.output(output_path, 'F')
|
304 |
|
305 |
return send_file(output_path, as_attachment=True)
|
306 |
|
|
|
1 |
import os
|
|
|
2 |
import requests
|
3 |
import pdfplumber
|
4 |
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
|
5 |
from werkzeug.utils import secure_filename
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
from transformers import AutoTokenizer
|
8 |
+
from fpdf import FPDF
|
9 |
from collections import Counter
|
10 |
+
from io import BytesIO
|
11 |
|
12 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
13 |
|
|
|
21 |
last_results = []
|
22 |
last_common_keywords = []
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def extract_pdf_text(pdf_path):
|
25 |
text = ""
|
26 |
try:
|
|
|
29 |
text += page.extract_text() or " "
|
30 |
except Exception as e:
|
31 |
print(f"Errore estrazione testo: {e}")
|
32 |
+
return text.lower().strip()
|
33 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def preprocess_text(text):
    """Tokenize *text* with the module-level BERT tokenizer.

    Returns only the sub-word tokens that are purely alphabetic and
    longer than three characters, filtering out punctuation, word-piece
    fragments and very short function words.
    """
    raw_tokens = tokenizer.tokenize(text.lower())
    return [tok for tok in raw_tokens if len(tok) > 3 and tok.isalpha()]
|
38 |
|
|
|
39 |
def calculate_token_overlap(text1, text2):
    """Return the percentage of *text1*'s whitespace-separated tokens
    that also appear in *text2*, rounded to two decimals.

    The denominator is clamped to at least 1, so an empty *text1*
    yields 0.0 instead of raising ZeroDivisionError.
    """
    left = set(text1.split())
    right = set(text2.split())
    shared = left & right
    denominator = len(left) if left else 1
    return round((len(shared) / denominator) * 100, 2)
|
44 |
|
|
|
45 |
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality/Uniqueness Index (OUI) as a 0-100 score.

    OUI = alpha * (1 - similarity/100) + beta * (1 - token_overlap/100),
    scaled by 100 and rounded to two decimals.  Lower values indicate the
    document is closer — semantically and lexically — to the comparison
    text.  A rounded result of negative zero is normalised to plain 0.0.
    """
    semantic_term = alpha * (1 - similarity / 100)
    lexical_term = beta * (1 - token_overlap / 100)
    result = round((semantic_term + lexical_term) * 100, 2)
    # -0.0 == 0.0 in Python, so this also leaves an exact 0.0 untouched.
    return 0.0 if result == 0.0 else result
|
49 |
+
|
50 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
|
51 |
+
pdf_text = extract_pdf_text(pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
pdf_tokens = preprocess_text(pdf_text)
|
53 |
results = []
|
54 |
all_keywords = []
|
55 |
|
|
|
|
|
56 |
for i, doc in enumerate(comparison_sources):
|
57 |
+
doc_text = extract_pdf_text(doc) if method == "local" else doc
|
58 |
doc_tokens = preprocess_text(doc_text)
|
59 |
|
60 |
+
similarity = util.pytorch_cos_sim(
|
61 |
+
model.encode(pdf_text, convert_to_tensor=True),
|
62 |
+
model.encode(doc_text, convert_to_tensor=True)
|
63 |
+
).item() * 100
|
64 |
+
|
65 |
+
token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
|
66 |
+
oui = calculate_oui(similarity, token_overlap)
|
67 |
+
title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
|
68 |
+
|
69 |
+
common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
|
70 |
+
all_keywords.extend(common_keywords)
|
71 |
+
|
|
|
|
|
|
|
72 |
results.append({
|
73 |
+
"title": title,
|
74 |
+
"similarity": round(similarity, 2),
|
75 |
+
"token_overlap": round(token_overlap, 2),
|
76 |
+
"oui": round(oui, 2)
|
77 |
})
|
78 |
|
79 |
global last_results, last_common_keywords
|
|
|
81 |
last_common_keywords = Counter(all_keywords).most_common(10)
|
82 |
return results
|
83 |
|
|
|
84 |
def fetch_pubmed_details(article_id):
|
85 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
86 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
|
|
89 |
response.raise_for_status()
|
90 |
import xml.etree.ElementTree as ET
|
91 |
root = ET.fromstring(response.text)
|
92 |
+
title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
|
93 |
+
abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
|
|
|
|
|
94 |
keywords = root.findall(".//Keyword")
|
95 |
keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
|
96 |
+
return title, f"{abstract} {keyword_text}"
|
|
|
|
|
|
|
97 |
except Exception as e:
|
98 |
print(f"Errore recupero abstract: {e}")
|
99 |
+
return "No Title", "No Abstract"
|
100 |
+
|
101 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed via the NCBI ESearch API and return matching article IDs.

    Parameters:
        query: free-text PubMed search term.
        year_start, year_end: inclusive publication-date (PDAT) range bounds.
        max_results: maximum number of IDs to return (default 10).

    Returns:
        A list of PubMed ID strings ranked by relevance; an empty list on
        any request/parse failure (errors are logged, never raised).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
        "sort": "relevance",  # rank hits by relevance rather than date
    }
    try:
        # Explicit timeout so a slow or unreachable NCBI endpoint cannot
        # hang the calling Flask request forever.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except Exception as e:
        # Best-effort lookup: log and return an empty result instead of
        # crashing the route that called us.
        print(f"Errore fetch PubMed: {e}")
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
@app.route("/")
|
120 |
def index():
|
|
|
151 |
year_start = request.form.get("year_start", "2000")
|
152 |
year_end = request.form.get("year_end", "2025")
|
153 |
num_articles = int(request.form.get("num_articles", "10"))
|
154 |
+
pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
|
155 |
|
156 |
+
if not pubmed_ids:
|
157 |
+
flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
|
|
|
|
|
158 |
return redirect(url_for("index"))
|
159 |
|
160 |
+
pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
|
161 |
+
pubmed_texts = [r[1] for r in pubmed_results]
|
162 |
+
pubmed_titles = [r[0] for r in pubmed_results]
|
163 |
|
164 |
+
results = validate_document(pdf_path, pubmed_texts, method="pubmed", titles=pubmed_titles)
|
165 |
|
166 |
return render_template("NORUS.html", results=results, keywords=last_common_keywords)
|
167 |
|
|
|
174 |
pdf = FPDF()
|
175 |
pdf.add_page()
|
176 |
pdf.set_font("Arial", "B", 16)
|
177 |
+
pdf.cell(0, 10, "NORUS Tool - Report Analisi", ln=True, align="C")
|
178 |
pdf.ln(10)
|
179 |
pdf.set_font('Arial', '', 12)
|
180 |
+
pdf.multi_cell(0, 10, "Indice OUI = alpha(1 - sim/100) + beta(1 - overlap/100), con alpha = 0.7 e beta = 0.3.\nValori più bassi di OUI indicano maggiore similarità semantica e testuale.")
|
181 |
pdf.ln(5)
|
182 |
pdf.set_font("Arial", "B", 12)
|
183 |
+
pdf.cell(90, 10, "Titolo", 1)
|
184 |
pdf.cell(30, 10, "Sim %", 1)
|
185 |
pdf.cell(30, 10, "Overlap %", 1)
|
186 |
pdf.cell(30, 10, "OUI", 1)
|
|
|
198 |
if last_common_keywords:
|
199 |
pdf.ln(6)
|
200 |
pdf.set_font("Arial", "B", 12)
|
201 |
+
pdf.cell(0, 10, "Parole chiave comuni:", ln=True)
|
202 |
pdf.set_font("Arial", "", 11)
|
203 |
for kw, count in last_common_keywords:
|
204 |
pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
|
|
|
208 |
pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
|
209 |
|
210 |
output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
|
211 |
+
pdf.output(output_path, 'F')
|
212 |
|
213 |
return send_file(output_path, as_attachment=True)
|
214 |
|