Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection
Browse files
app.py
CHANGED
@@ -47,40 +47,6 @@ def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
|
|
47 |
result = round(oui * 100, 2)
|
48 |
return 0.0 if result == -0.0 else result
|
49 |
|
50 |
-
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
|
51 |
-
pdf_text = extract_pdf_text(pdf_path)
|
52 |
-
pdf_tokens = preprocess_text(pdf_text)
|
53 |
-
results = []
|
54 |
-
all_keywords = []
|
55 |
-
|
56 |
-
for i, doc in enumerate(comparison_sources):
|
57 |
-
doc_text = extract_pdf_text(doc) if method == "local" else doc
|
58 |
-
doc_tokens = preprocess_text(doc_text)
|
59 |
-
|
60 |
-
similarity = util.pytorch_cos_sim(
|
61 |
-
model.encode(pdf_text, convert_to_tensor=True),
|
62 |
-
model.encode(doc_text, convert_to_tensor=True)
|
63 |
-
).item() * 100
|
64 |
-
|
65 |
-
token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
|
66 |
-
oui = calculate_oui(similarity, token_overlap)
|
67 |
-
title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
|
68 |
-
|
69 |
-
common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
|
70 |
-
all_keywords.extend(common_keywords)
|
71 |
-
|
72 |
-
results.append({
|
73 |
-
"title": title,
|
74 |
-
"similarity": round(similarity, 2),
|
75 |
-
"token_overlap": round(token_overlap, 2),
|
76 |
-
"oui": round(oui, 2)
|
77 |
-
})
|
78 |
-
|
79 |
-
global last_results, last_common_keywords
|
80 |
-
last_results = results
|
81 |
-
last_common_keywords = Counter(all_keywords).most_common(10)
|
82 |
-
return results
|
83 |
-
|
84 |
def fetch_pubmed_details(article_id):
|
85 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
86 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
@@ -97,9 +63,9 @@ def fetch_pubmed_details(article_id):
|
|
97 |
title = title_elem.text if title_elem is not None else None
|
98 |
abstract = abstract_elem.text if abstract_elem is not None else None
|
99 |
|
|
|
100 |
if not title or not abstract:
|
101 |
-
#
|
102 |
-
return "No Title", "No Abstract"
|
103 |
|
104 |
# Recupero delle parole chiave
|
105 |
keywords = root.findall(".//Keyword")
|
@@ -108,7 +74,7 @@ def fetch_pubmed_details(article_id):
|
|
108 |
return title, f"{abstract} {keyword_text}"
|
109 |
except Exception as e:
|
110 |
print(f"Errore recupero abstract: {e}")
|
111 |
-
return
|
112 |
|
113 |
def fetch_pubmed(query, year_start, year_end, max_results=10):
|
114 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
|
@@ -128,6 +94,43 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
|
|
128 |
print(f"Errore fetch PubMed: {e}")
|
129 |
return []
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
@app.route("/")
|
132 |
def index():
|
133 |
return render_template("NORUS.html")
|
@@ -166,10 +169,16 @@ def validate():
|
|
166 |
pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
|
167 |
|
168 |
if not pubmed_ids:
|
169 |
-
flash("
|
170 |
return redirect(url_for("index"))
|
171 |
|
172 |
pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
pubmed_texts = [r[1] for r in pubmed_results]
|
174 |
pubmed_titles = [r[0] for r in pubmed_results]
|
175 |
|
@@ -225,4 +234,4 @@ def download_report():
|
|
225 |
return send_file(output_path, as_attachment=True)
|
226 |
|
227 |
if __name__ == "__main__":
|
228 |
-
app.run(debug=True, host="0.0.0.0", port=7860)
|
|
|
47 |
result = round(oui * 100, 2)
|
48 |
return 0.0 if result == -0.0 else result
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def fetch_pubmed_details(article_id):
|
51 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
52 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
|
|
63 |
title = title_elem.text if title_elem is not None else None
|
64 |
abstract = abstract_elem.text if abstract_elem is not None else None
|
65 |
|
66 |
+
# Se manca titolo o abstract, ignoriamo questo articolo
|
67 |
if not title or not abstract:
|
68 |
+
return None # Restituisce None se non ci sono titolo o abstract
|
|
|
69 |
|
70 |
# Recupero delle parole chiave
|
71 |
keywords = root.findall(".//Keyword")
|
|
|
74 |
return title, f"{abstract} {keyword_text}"
|
75 |
except Exception as e:
|
76 |
print(f"Errore recupero abstract: {e}")
|
77 |
+
return None # Restituisce None se c'è un errore nella richiesta
|
78 |
|
79 |
def fetch_pubmed(query, year_start, year_end, max_results=10):
|
80 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
|
|
|
94 |
print(f"Errore fetch PubMed: {e}")
|
95 |
return []
|
96 |
|
97 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare one PDF against a list of reference documents.

    For every usable entry in ``comparison_sources`` the function computes
    the embedding cosine similarity, the token overlap and the combined OUI
    score against the text extracted from ``pdf_path``.

    Args:
        pdf_path: Path of the PDF under evaluation.
        comparison_sources: Reference documents. With ``method="local"``
            each entry is passed to ``extract_pdf_text``; otherwise each
            entry is used directly as the reference text.
        method: ``"local"`` to read the references as PDF files; any other
            value treats the entries as already-extracted text.
        titles: Optional titles aligned by index with
            ``comparison_sources``.

    Returns:
        A list of dicts with the keys ``title``, ``similarity``,
        ``token_overlap`` and ``oui``, one per reference that was not
        skipped.

    Side effects:
        Rebinds the module globals ``last_results`` and
        ``last_common_keywords``.
    """
    global last_results, last_common_keywords

    pdf_text = extract_pdf_text(pdf_path)
    pdf_tokens = preprocess_text(pdf_text)

    # Everything derived only from the evaluated PDF is loop-invariant, so it
    # is computed once instead of once per reference document.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    pdf_token_text = " ".join(pdf_tokens)
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    pdf_unique_tokens = list(dict.fromkeys(pdf_tokens))

    results = []
    all_keywords = []

    for i, doc in enumerate(comparison_sources):
        if not doc:  # Skip invalid (empty / missing) articles.
            continue

        doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_tokens = preprocess_text(doc_text)

        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100

        token_overlap = calculate_token_overlap(pdf_token_text, " ".join(doc_tokens))
        # NOTE(review): similarity is passed here on a 0-100 scale and
        # calculate_oui multiplies its result by 100 again before rounding;
        # confirm the scales expected by calculate_oui.
        oui = calculate_oui(similarity, token_overlap)

        # ``i`` is the index in comparison_sources (skipped entries included),
        # so titles stay aligned with their documents.
        if titles and i < len(titles):
            title = titles[i]
        elif method == "local":
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        # Take the first five shared tokens in order of appearance in the PDF.
        # Slicing a raw set intersection picked an arbitrary five that varied
        # between runs because of string hash randomisation.
        doc_token_set = set(doc_tokens)
        common_keywords = [
            token for token in pdf_unique_tokens if token in doc_token_set
        ][:5]
        all_keywords.extend(common_keywords)

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })

    last_results = results
    last_common_keywords = Counter(all_keywords).most_common(10)
    return results
|
133 |
+
|
134 |
@app.route("/")
def index():
    """Render the ``NORUS.html`` template for the root URL."""
    landing_page = render_template("NORUS.html")
    return landing_page
|
|
|
169 |
pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
|
170 |
|
171 |
if not pubmed_ids:
|
172 |
+
flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
|
173 |
return redirect(url_for("index"))
|
174 |
|
175 |
pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
|
176 |
+
# Filtrare gli articoli che non hanno titolo o abstract
|
177 |
+
pubmed_results = [result for result in pubmed_results if result is not None]
|
178 |
+
if not pubmed_results:
|
179 |
+
flash("Nessun articolo valido trovato da PubMed.", "error")
|
180 |
+
return redirect(url_for("index"))
|
181 |
+
|
182 |
pubmed_texts = [r[1] for r in pubmed_results]
|
183 |
pubmed_titles = [r[0] for r in pubmed_results]
|
184 |
|
|
|
234 |
return send_file(output_path, as_attachment=True)
|
235 |
|
236 |
if __name__ == "__main__":
    import os

    # The server listens on every interface (0.0.0.0), and Flask's debug mode
    # enables the Werkzeug interactive debugger, which lets a client execute
    # arbitrary Python. Debug is therefore opt-in via FLASK_DEBUG instead of
    # being hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
|