Major improvement: semantic and token cleaning
app.py (CHANGED)
@@ -1,13 +1,14 @@
 import os
+import re
 import requests
 import pdfplumber
 from flask import Flask, render_template, request, redirect, url_for, flash, send_file
 from werkzeug.utils import secure_filename
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer
-from fpdf import FPDF
+from fpdf import FPDF
 from collections import Counter
-from io import BytesIO
+from io import BytesIO
 
 tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
@@ -21,6 +22,20 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
 last_results = []
 last_common_keywords = []
 
+# Advanced cleaning function
+def clean_text(text):
+    boilerplate_phrases = [
+        "in recent years", "this study", "data suggest that", "in conclusion",
+        "introduction", "methods", "results", "discussion", "this review", "we aimed to",
+        "the aim of this study", "background", "objective", "methodology", "results and discussion"
+    ]
+    text = text.lower()
+    for phrase in boilerplate_phrases:
+        text = text.replace(phrase, "")
+    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
+    return text.strip()
+
+# PDF text extraction function
 def extract_pdf_text(pdf_path):
     text = ""
     try:
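To make the effect of the new cleaning step concrete, here is a small self-contained sketch that mirrors the clean_text() added above (same phrase list, same whitespace collapsing); the sample abstract is invented for illustration.

```python
import re

# Mirrors the clean_text() added above: lowercase, strip boilerplate phrases,
# then collapse the whitespace left behind.
BOILERPLATE_PHRASES = [
    "in recent years", "this study", "data suggest that", "in conclusion",
    "introduction", "methods", "results", "discussion", "this review", "we aimed to",
    "the aim of this study", "background", "objective", "methodology", "results and discussion"
]

def clean_text(text):
    text = text.lower()
    for phrase in BOILERPLATE_PHRASES:
        text = text.replace(phrase, "")
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

sample = "INTRODUCTION: In recent years, deep learning has improved PDF parsing. This study evaluates it."
print(clean_text(sample))
# -> ": , deep learning has improved pdf parsing. evaluates it."
```

Plain substring replacement leaves stray punctuation behind, which is harmless here because preprocess_text() later keeps only alphabetic tokens longer than three characters.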
@@ -29,24 +44,34 @@ def extract_pdf_text(pdf_path):
                 text += page.extract_text() or " "
     except Exception as e:
         print(f"Errore estrazione testo: {e}")
-    return text
+    return clean_text(text)
 
+# Token preprocessing
 def preprocess_text(text):
     tokens = tokenizer.tokenize(text.lower())
     tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
+    # Minimal list of common scientific stopwords
+    stopwords = set([
+        "study", "data", "results", "analysis", "introduction", "conclusion",
+        "method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
+    ])
+    tokens = [token for token in tokens if token not in stopwords]
     return tokens
 
+# Improved token overlap calculation
 def calculate_token_overlap(text1, text2):
     tokens1 = set(text1.split())
     tokens2 = set(text2.split())
     overlap = len(tokens1 & tokens2)
     return round((overlap / max(len(tokens1), 1)) * 100, 2)
 
+# Updated OUI formula
 def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
     oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
     result = round(oui * 100, 2)
     return 0.0 if result == -0.0 else result
 
+# Document validation
 def validate_document(pdf_path, comparison_sources, method="local", titles=None):
     pdf_text = extract_pdf_text(pdf_path)
     pdf_tokens = preprocess_text(pdf_text)
@@ -54,7 +79,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     all_keywords = []
 
     for i, doc in enumerate(comparison_sources):
-        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        doc_text = extract_pdf_text(doc) if method == "local" else clean_text(doc)
         doc_tokens = preprocess_text(doc_text)
 
         similarity = util.pytorch_cos_sim(
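As a quick sanity check on the two scores that feed the OUI formula, the snippet below copies calculate_token_overlap() and calculate_oui() verbatim from the hunk above and runs them on made-up values.

```python
def calculate_token_overlap(text1, text2):
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)

def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    result = round(oui * 100, 2)
    return 0.0 if result == -0.0 else result

# Overlap is normalised by the first text's vocabulary, so it is asymmetric.
print(calculate_token_overlap("gene expression in tumor cells", "tumor cells growth"))  # 40.0
print(calculate_token_overlap("tumor cells growth", "gene expression in tumor cells"))  # 66.67

# With an embedding similarity of 82.5 and a token overlap of 40.0:
# 0.7 * (1 - 0.825) + 0.3 * (1 - 0.40) = 0.1225 + 0.18 = 0.3025  ->  OUI = 30.25
print(calculate_oui(82.5, 40.0))  # 30.25
```

Both terms are inverted, so higher similarity and higher token overlap drive the OUI toward zero.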
@@ -81,6 +106,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     last_common_keywords = Counter(all_keywords).most_common(10)
     return results
 
+# Fetch article details from PubMed
 def fetch_pubmed_details(article_id):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -89,23 +115,21 @@ def fetch_pubmed_details(article_id):
         response.raise_for_status()
         import xml.etree.ElementTree as ET
         root = ET.fromstring(response.text)
-
-
-
-
-        abstract = abstract_element.text.strip() if abstract_element is not None and abstract_element.text else "No Abstract"
-
+        title_elem = root.find(".//ArticleTitle")
+        abstract_elem = root.find(".//AbstractText")
+        title = title_elem.text if title_elem is not None else None
+        abstract = abstract_elem.text if abstract_elem is not None else None
         keywords = root.findall(".//Keyword")
         keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
-
-
-
-
-        return title, f"{abstract} {keyword_text}"
+        if title and abstract:
+            return title, f"{abstract} {keyword_text}"
+        else:
+            return None
     except Exception as e:
         print(f"Errore recupero abstract: {e}")
         return None
 
+# Fetch article IDs from PubMed
 def fetch_pubmed(query, year_start, year_end, max_results=10):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
     params = {
@@ -113,7 +137,7 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
         "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
         "retmax": max_results,
         "retmode": "json",
-        "sort": "relevance"
+        "sort": "relevance"  # Very important: sort by relevance
     }
     try:
         response = requests.get(base_url, params=params)
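For context, this is roughly how the two helpers chain together against the NCBI E-utilities endpoints used above: esearch returns a JSON list of PubMed IDs, and efetch returns the XML record that fetch_pubmed_details() parses. The query string and retmax value are placeholders, and error handling is omitted.

```python
import requests
import xml.etree.ElementTree as ET

# Step 1: esearch -> PubMed IDs as JSON, sorted by relevance (placeholder query).
search = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params={
        "db": "pubmed",
        "term": "machine learning AND (2020[PDAT] : 2025[PDAT])",
        "retmax": 3,
        "retmode": "json",
        "sort": "relevance",
    },
)
ids = search.json()["esearchresult"]["idlist"]

# Step 2: efetch -> XML with ArticleTitle / AbstractText / Keyword nodes per ID.
for article_id in ids:
    fetch = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={"db": "pubmed", "id": article_id, "retmode": "xml"},
    )
    root = ET.fromstring(fetch.text)
    title = root.find(".//ArticleTitle")
    abstract = root.find(".//AbstractText")
    print(article_id,
          title.text if title is not None else None,
          (abstract.text or "")[:80] if abstract is not None else None)
```

With the change above, fetch_pubmed_details() now returns None whenever either the title or the abstract is missing, which is what the validate() route below counts as an invalid article.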
@@ -159,12 +183,23 @@ def validate():
         year_end = request.form.get("year_end", "2025")
         num_articles = int(request.form.get("num_articles", "10"))
         pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
+
+        if not pubmed_ids:
+            flash("Nessun articolo trovato su PubMed. Modifica la query o il range di anni.", "error")
+            return redirect(url_for("index"))
+
         pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
+        total_articles = len(pubmed_ids)
+        valid_articles = len([r for r in pubmed_results if r is not None])
         pubmed_results = [r for r in pubmed_results if r is not None]
 
+        print(f"Trovati {total_articles} articoli da PubMed. Validi dopo controllo: {valid_articles} articoli.")
+
         if not pubmed_results:
             flash("Nessun articolo PubMed valido trovato. Modifica la query o il range di anni.", "error")
             return redirect(url_for("index"))
+        elif valid_articles < total_articles:
+            flash(f"⚠️ Trovati solo {valid_articles} articoli validi su {total_articles} richiesti.", "warning")
 
         results = validate_document(pdf_path, [r[1] for r in pubmed_results], method="pubmed", titles=[r[0] for r in pubmed_results])
 
@@ -213,9 +248,9 @@ def download_report():
     pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
 
     output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
-    pdf.output(output_path, 'F')
+    pdf.output(output_path, 'F')
 
-    return send_file(output_path, as_attachment=True)
+    return send_file(output_path, as_attachment=True)
 
 if __name__ == "__main__":
-    app.run(debug=True, host="0.0.0.0", port=7860)
+    app.run(debug=True, host="0.0.0.0", port=7860)
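The new partial-results path flashes with a "warning" category in addition to the existing "error" one; whether users actually see it depends on the templates, which are not part of this diff. A minimal sketch, assuming nothing about the project's templates, of how Flask hands back the two categories (the sample messages are invented variants of the ones above):

```python
from flask import Flask, flash, get_flashed_messages

app = Flask(__name__)
app.secret_key = "dev"  # flash() stores messages in the session

with app.test_request_context("/"):
    flash("Trovati solo 3 articoli validi su 10 richiesti.", "warning")
    flash("Nessun articolo PubMed valido trovato.", "error")
    # with_categories=True yields (category, message) pairs, so a template
    # can style "warning" and "error" banners differently.
    print(get_flashed_messages(with_categories=True))
    # [('warning', 'Trovati solo 3 articoli validi su 10 richiesti.'),
    #  ('error', 'Nessun articolo PubMed valido trovato.')]
```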