Spaces:

mabil
/

NORUS2

Running

App Files Community

mabil committed 17 days ago

Commit

088b011

1 Parent(s): 8b3c01e

Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection

Browse files

Files changed (1) hide show

app.py +48 -39

app.py CHANGED Viewed

@@ -47,40 +47,6 @@ def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
     result = round(oui * 100, 2)
     return 0.0 if result == -0.0 else result
-def validate_document(pdf_path, comparison_sources, method="local", titles=None):
-    pdf_text = extract_pdf_text(pdf_path)
-    pdf_tokens = preprocess_text(pdf_text)
-    results = []
-    all_keywords = []
-    for i, doc in enumerate(comparison_sources):
-        doc_text = extract_pdf_text(doc) if method == "local" else doc
-        doc_tokens = preprocess_text(doc_text)
-        similarity = util.pytorch_cos_sim(
-            model.encode(pdf_text, convert_to_tensor=True),
-            model.encode(doc_text, convert_to_tensor=True)
-        ).item() * 100
-        token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
-        oui = calculate_oui(similarity, token_overlap)
-        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
-        common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
-        all_keywords.extend(common_keywords)
-        results.append({
-            "title": title,
-            "similarity": round(similarity, 2),
-            "token_overlap": round(token_overlap, 2),
-            "oui": round(oui, 2)
-        })
-    global last_results, last_common_keywords
-    last_results = results
-    last_common_keywords = Counter(all_keywords).most_common(10)
-    return results
 def fetch_pubmed_details(article_id):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -97,9 +63,9 @@ def fetch_pubmed_details(article_id):
         title = title_elem.text if title_elem is not None else None
         abstract = abstract_elem.text if abstract_elem is not None else None
         if not title or not abstract:
-            # Se manca titolo o abstract, restituisci un messaggio chiaro
-            return "No Title", "No Abstract"
         # Recupero delle parole chiave
         keywords = root.findall(".//Keyword")
@@ -108,7 +74,7 @@ def fetch_pubmed_details(article_id):
         return title, f"{abstract} {keyword_text}"
     except Exception as e:
         print(f"Errore recupero abstract: {e}")
-        return "No Title", "No Abstract"
 def fetch_pubmed(query, year_start, year_end, max_results=10):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
@@ -128,6 +94,43 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
         print(f"Errore fetch PubMed: {e}")
         return []
 @app.route("/")
 def index():
     return render_template("NORUS.html")
@@ -166,10 +169,16 @@ def validate():
         pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
         if not pubmed_ids:
-            flash("❌ Nessun articolo trovato su PubMed per questa ricerca.", "error")
             return redirect(url_for("index"))
         pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
         pubmed_texts = [r[1] for r in pubmed_results]
         pubmed_titles = [r[0] for r in pubmed_results]
@@ -225,4 +234,4 @@ def download_report():
     return send_file(output_path, as_attachment=True)
 if __name__ == "__main__":
-    app.run(debug=True, host="0.0.0.0", port=7860)

     result = round(oui * 100, 2)
     return 0.0 if result == -0.0 else result
 def fetch_pubmed_details(article_id):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
         title = title_elem.text if title_elem is not None else None
         abstract = abstract_elem.text if abstract_elem is not None else None
+        # Se manca titolo o abstract, ignoriamo questo articolo
         if not title or not abstract:
+            return None  # Restituisce None se non ci sono titolo o abstract
         # Recupero delle parole chiave
         keywords = root.findall(".//Keyword")
         return title, f"{abstract} {keyword_text}"
     except Exception as e:
         print(f"Errore recupero abstract: {e}")
+        return None  # Restituisce None se c'è un errore nella richiesta
 def fetch_pubmed(query, year_start, year_end, max_results=10):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
         print(f"Errore fetch PubMed: {e}")
         return []
+def validate_document(pdf_path, comparison_sources, method="local", titles=None):
+    pdf_text = extract_pdf_text(pdf_path)
+    pdf_tokens = preprocess_text(pdf_text)
+    results = []
+    all_keywords = []
+    for i, doc in enumerate(comparison_sources):
+        if not doc:  # Salta gli articoli invalidi
+            continue
+        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        doc_tokens = preprocess_text(doc_text)
+        similarity = util.pytorch_cos_sim(
+            model.encode(pdf_text, convert_to_tensor=True),
+            model.encode(doc_text, convert_to_tensor=True)
+        ).item() * 100
+        token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
+        oui = calculate_oui(similarity, token_overlap)
+        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
+        common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
+        all_keywords.extend(common_keywords)
+        results.append({
+            "title": title,
+            "similarity": round(similarity, 2),
+            "token_overlap": round(token_overlap, 2),
+            "oui": round(oui, 2)
+        })
+    global last_results, last_common_keywords
+    last_results = results
+    last_common_keywords = Counter(all_keywords).most_common(10)
+    return results
 @app.route("/")
 def index():
     return render_template("NORUS.html")
         pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
         if not pubmed_ids:
+            flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
             return redirect(url_for("index"))
         pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
+        # Filtrare gli articoli che non hanno titolo o abstract
+        pubmed_results = [result for result in pubmed_results if result is not None]
+        if not pubmed_results:
+            flash("Nessun articolo valido trovato da PubMed.", "error")
+            return redirect(url_for("index"))
         pubmed_texts = [r[1] for r in pubmed_results]
         pubmed_titles = [r[0] for r in pubmed_results]
     return send_file(output_path, as_attachment=True)
 if __name__ == "__main__":
+    app.run(debug=True, host="0.0.0.0", port=7860)