Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection
Browse files
app.py
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
import os
|
2 |
-
import re
|
3 |
import requests
|
4 |
import pdfplumber
|
5 |
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
|
6 |
from werkzeug.utils import secure_filename
|
7 |
from sentence_transformers import SentenceTransformer, util
|
8 |
from transformers import AutoTokenizer
|
9 |
-
from fpdf import FPDF
|
10 |
from collections import Counter
|
11 |
-
from io import BytesIO
|
12 |
|
13 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
14 |
|
@@ -22,20 +21,6 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
|
|
22 |
last_results = []
|
23 |
last_common_keywords = []
|
24 |
|
25 |
-
# Funzione di cleaning avanzato
|
26 |
-
def clean_text(text):
|
27 |
-
boilerplate_phrases = [
|
28 |
-
"in recent years", "this study", "data suggest that", "in conclusion",
|
29 |
-
"introduction", "methods", "results", "discussion", "this review", "we aimed to",
|
30 |
-
"the aim of this study", "background", "objective", "methodology", "results and discussion"
|
31 |
-
]
|
32 |
-
text = text.lower()
|
33 |
-
for phrase in boilerplate_phrases:
|
34 |
-
text = text.replace(phrase, "")
|
35 |
-
text = re.sub(r'\s+', ' ', text)
|
36 |
-
return text.strip()
|
37 |
-
|
38 |
-
# Estrazione testo PDF
|
39 |
def extract_pdf_text(pdf_path):
|
40 |
text = ""
|
41 |
try:
|
@@ -44,96 +29,51 @@ def extract_pdf_text(pdf_path):
|
|
44 |
text += page.extract_text() or " "
|
45 |
except Exception as e:
|
46 |
print(f"Errore estrazione testo: {e}")
|
47 |
-
return
|
48 |
-
|
49 |
-
# Funzione per estrarre la sezione "Materiali e Metodi"
|
50 |
-
def extract_materials_and_methods(pdf_path):
|
51 |
-
text = extract_pdf_text(pdf_path)
|
52 |
-
# Supponiamo che la sezione 'Materiali e Metodi' sia identificabile da uno degli headers comuni
|
53 |
-
start = text.lower().find("materials and methods")
|
54 |
-
if start == -1:
|
55 |
-
return text # Restituisce tutto il testo se non trova la sezione
|
56 |
-
end = text.lower().find("results", start)
|
57 |
-
if end == -1:
|
58 |
-
end = len(text) # Fino alla fine del documento se non trova la fine della sezione
|
59 |
-
return text[start:end]
|
60 |
-
|
61 |
-
# Preprocessing testo
|
62 |
def preprocess_text(text):
|
63 |
tokens = tokenizer.tokenize(text.lower())
|
64 |
tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
|
65 |
-
stopwords = set([
|
66 |
-
"study", "data", "results", "analysis", "introduction", "conclusion",
|
67 |
-
"method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
|
68 |
-
])
|
69 |
-
tokens = [token for token in tokens if token not in stopwords]
|
70 |
return tokens
|
71 |
|
72 |
-
# Calcolo token overlap
|
73 |
def calculate_token_overlap(text1, text2):
|
74 |
tokens1 = set(text1.split())
|
75 |
tokens2 = set(text2.split())
|
76 |
overlap = len(tokens1 & tokens2)
|
77 |
return round((overlap / max(len(tokens1), 1)) * 100, 2)
|
78 |
|
79 |
-
# Formula OUI
|
80 |
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
|
81 |
oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
|
82 |
result = round(oui * 100, 2)
|
83 |
-
if result
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
# Funzione di scoring degli articoli
|
88 |
-
def score_article(article, pdf_text):
|
89 |
-
# Calcola la similarità semantica
|
90 |
-
similarity = util.pytorch_cos_sim(
|
91 |
-
model.encode(pdf_text, convert_to_tensor=True),
|
92 |
-
model.encode(article['text'], convert_to_tensor=True)
|
93 |
-
).item() * 100
|
94 |
-
|
95 |
-
# Calcola il numero di keyword comuni
|
96 |
-
tokens_pdf = preprocess_text(pdf_text)
|
97 |
-
tokens_article = preprocess_text(article['text'])
|
98 |
-
common_tokens = len(set(tokens_pdf) & set(tokens_article))
|
99 |
-
|
100 |
-
# Punteggio complessivo (modificabile in base ai tuoi parametri)
|
101 |
-
score = 0.7 * similarity + 0.3 * common_tokens
|
102 |
-
return score
|
103 |
-
|
104 |
-
# Validazione documento
|
105 |
-
def validate_document(pdf_path, comparison_sources, method="local", titles=None, num_articles=10):
|
106 |
-
pdf_text = extract_materials_and_methods(pdf_path)
|
107 |
pdf_tokens = preprocess_text(pdf_text)
|
108 |
results = []
|
109 |
all_keywords = []
|
110 |
|
111 |
-
# Calcolare il punteggio di ogni articolo
|
112 |
-
scored_articles = []
|
113 |
for i, doc in enumerate(comparison_sources):
|
114 |
-
doc_text =
|
115 |
doc_tokens = preprocess_text(doc_text)
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
# Seleziona i migliori articoli in base al punteggio
|
130 |
-
for i in range(min(num_articles, len(scored_articles))):
|
131 |
-
article = scored_articles[i][1]
|
132 |
results.append({
|
133 |
-
"title":
|
134 |
-
"similarity": round(
|
135 |
-
"token_overlap": round(
|
136 |
-
"oui":
|
137 |
})
|
138 |
|
139 |
global last_results, last_common_keywords
|
@@ -141,7 +81,6 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None,
|
|
141 |
last_common_keywords = Counter(all_keywords).most_common(10)
|
142 |
return results
|
143 |
|
144 |
-
# Fetch dettagli articoli da PubMed
|
145 |
def fetch_pubmed_details(article_id):
|
146 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
147 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
@@ -150,62 +89,32 @@ def fetch_pubmed_details(article_id):
|
|
150 |
response.raise_for_status()
|
151 |
import xml.etree.ElementTree as ET
|
152 |
root = ET.fromstring(response.text)
|
153 |
-
|
154 |
-
|
155 |
-
title = title_elem.text if title_elem is not None else None
|
156 |
-
abstract = abstract_elem.text if abstract_elem is not None else None
|
157 |
keywords = root.findall(".//Keyword")
|
158 |
keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
|
159 |
-
|
160 |
-
return title, f"{abstract} {keyword_text}"
|
161 |
-
else:
|
162 |
-
return None
|
163 |
except Exception as e:
|
164 |
print(f"Errore recupero abstract: {e}")
|
165 |
-
return
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
}
|
185 |
-
try:
|
186 |
-
response = requests.get(base_search_url, params=params)
|
187 |
-
response.raise_for_status()
|
188 |
-
batch_ids = response.json().get("esearchresult", {}).get("idlist", [])
|
189 |
-
if not batch_ids:
|
190 |
-
break
|
191 |
-
|
192 |
-
all_ids.extend(batch_ids)
|
193 |
-
start += batch_size
|
194 |
-
|
195 |
-
for id_ in batch_ids:
|
196 |
-
result = fetch_pubmed_details(id_)
|
197 |
-
if result:
|
198 |
-
fetched_articles.append(result)
|
199 |
-
if len(fetched_articles) == desired_articles:
|
200 |
-
break
|
201 |
-
|
202 |
-
except Exception as e:
|
203 |
-
print(f"Errore fetch batch PubMed: {e}")
|
204 |
-
break
|
205 |
-
|
206 |
-
attempts += 1
|
207 |
-
|
208 |
-
return fetched_articles
|
209 |
|
210 |
@app.route("/")
|
211 |
def index():
|
@@ -242,18 +151,17 @@ def validate():
|
|
242 |
year_start = request.form.get("year_start", "2000")
|
243 |
year_end = request.form.get("year_end", "2025")
|
244 |
num_articles = int(request.form.get("num_articles", "10"))
|
|
|
245 |
|
246 |
-
|
247 |
-
|
248 |
-
if not pubmed_results:
|
249 |
-
flash("❌ Nessun articolo PubMed valido trovato. Modifica la query o il range di anni.", "error")
|
250 |
return redirect(url_for("index"))
|
251 |
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
|
256 |
-
results = validate_document(pdf_path,
|
257 |
|
258 |
return render_template("NORUS.html", results=results, keywords=last_common_keywords)
|
259 |
|
@@ -266,13 +174,13 @@ def download_report():
|
|
266 |
pdf = FPDF()
|
267 |
pdf.add_page()
|
268 |
pdf.set_font("Arial", "B", 16)
|
269 |
-
pdf.cell(0, 10, "NORUS Tool - Report
|
270 |
pdf.ln(10)
|
271 |
pdf.set_font('Arial', '', 12)
|
272 |
-
pdf.multi_cell(0, 10, "OUI
|
273 |
pdf.ln(5)
|
274 |
pdf.set_font("Arial", "B", 12)
|
275 |
-
pdf.cell(90, 10, "
|
276 |
pdf.cell(30, 10, "Sim %", 1)
|
277 |
pdf.cell(30, 10, "Overlap %", 1)
|
278 |
pdf.cell(30, 10, "OUI", 1)
|
@@ -290,7 +198,7 @@ def download_report():
|
|
290 |
if last_common_keywords:
|
291 |
pdf.ln(6)
|
292 |
pdf.set_font("Arial", "B", 12)
|
293 |
-
pdf.cell(0, 10, "
|
294 |
pdf.set_font("Arial", "", 11)
|
295 |
for kw, count in last_common_keywords:
|
296 |
pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
|
@@ -300,7 +208,7 @@ def download_report():
|
|
300 |
pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
|
301 |
|
302 |
output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
|
303 |
-
pdf.output(output_path, 'F')
|
304 |
|
305 |
return send_file(output_path, as_attachment=True)
|
306 |
|
|
|
1 |
import os
|
|
|
2 |
import requests
|
3 |
import pdfplumber
|
4 |
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
|
5 |
from werkzeug.utils import secure_filename
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
from transformers import AutoTokenizer
|
8 |
+
from fpdf import FPDF
|
9 |
from collections import Counter
|
10 |
+
from io import BytesIO
|
11 |
|
12 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
13 |
|
|
|
21 |
last_results = []
|
22 |
last_common_keywords = []
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def extract_pdf_text(pdf_path):
|
25 |
text = ""
|
26 |
try:
|
|
|
29 |
text += page.extract_text() or " "
|
30 |
except Exception as e:
|
31 |
print(f"Errore estrazione testo: {e}")
|
32 |
+
return text.lower().strip()
|
33 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def preprocess_text(text):
    """Tokenize *text* with the module-level BERT tokenizer.

    Returns only the sub-word tokens that are purely alphabetic and
    longer than three characters, filtering out punctuation, word-piece
    fragments and very short function words.
    """
    raw_tokens = tokenizer.tokenize(text.lower())
    return [tok for tok in raw_tokens if len(tok) > 3 and tok.isalpha()]
|
38 |
|
|
|
39 |
def calculate_token_overlap(text1, text2):
    """Return the percentage of *text1*'s whitespace-separated tokens
    that also appear in *text2*, rounded to two decimals.

    The denominator is clamped to at least 1, so an empty *text1*
    yields 0.0 instead of raising ZeroDivisionError.
    """
    left = set(text1.split())
    right = set(text2.split())
    shared = left & right
    denominator = len(left) if left else 1
    return round((len(shared) / denominator) * 100, 2)
|
44 |
|
|
|
45 |
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality/Uniqueness Index (OUI) as a 0-100 score.

    OUI = alpha * (1 - similarity/100) + beta * (1 - token_overlap/100),
    scaled by 100 and rounded to two decimals.  Lower values indicate the
    document is closer — semantically and lexically — to the comparison
    text.  A rounded result of negative zero is normalised to plain 0.0.
    """
    semantic_term = alpha * (1 - similarity / 100)
    lexical_term = beta * (1 - token_overlap / 100)
    result = round((semantic_term + lexical_term) * 100, 2)
    # -0.0 == 0.0 in Python, so this also leaves an exact 0.0 untouched.
    return 0.0 if result == 0.0 else result
|
49 |
+
|
50 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
|
51 |
+
pdf_text = extract_pdf_text(pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
pdf_tokens = preprocess_text(pdf_text)
|
53 |
results = []
|
54 |
all_keywords = []
|
55 |
|
|
|
|
|
56 |
for i, doc in enumerate(comparison_sources):
|
57 |
+
doc_text = extract_pdf_text(doc) if method == "local" else doc
|
58 |
doc_tokens = preprocess_text(doc_text)
|
59 |
|
60 |
+
similarity = util.pytorch_cos_sim(
|
61 |
+
model.encode(pdf_text, convert_to_tensor=True),
|
62 |
+
model.encode(doc_text, convert_to_tensor=True)
|
63 |
+
).item() * 100
|
64 |
+
|
65 |
+
token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
|
66 |
+
oui = calculate_oui(similarity, token_overlap)
|
67 |
+
title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
|
68 |
+
|
69 |
+
common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
|
70 |
+
all_keywords.extend(common_keywords)
|
71 |
+
|
|
|
|
|
|
|
72 |
results.append({
|
73 |
+
"title": title,
|
74 |
+
"similarity": round(similarity, 2),
|
75 |
+
"token_overlap": round(token_overlap, 2),
|
76 |
+
"oui": round(oui, 2)
|
77 |
})
|
78 |
|
79 |
global last_results, last_common_keywords
|
|
|
81 |
last_common_keywords = Counter(all_keywords).most_common(10)
|
82 |
return results
|
83 |
|
|
|
84 |
def fetch_pubmed_details(article_id):
|
85 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
86 |
params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
|
|
|
89 |
response.raise_for_status()
|
90 |
import xml.etree.ElementTree as ET
|
91 |
root = ET.fromstring(response.text)
|
92 |
+
title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
|
93 |
+
abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
|
|
|
|
|
94 |
keywords = root.findall(".//Keyword")
|
95 |
keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
|
96 |
+
return title, f"{abstract} {keyword_text}"
|
|
|
|
|
|
|
97 |
except Exception as e:
|
98 |
print(f"Errore recupero abstract: {e}")
|
99 |
+
return "No Title", "No Abstract"
|
100 |
+
|
101 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed via the NCBI ESearch API and return matching article IDs.

    Parameters:
        query: free-text PubMed search term.
        year_start, year_end: inclusive publication-date (PDAT) range bounds.
        max_results: maximum number of IDs to return (default 10).

    Returns:
        A list of PubMed ID strings ranked by relevance; an empty list on
        any request/parse failure (errors are logged, never raised).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
        "sort": "relevance",  # rank hits by relevance rather than date
    }
    try:
        # Explicit timeout so a slow or unreachable NCBI endpoint cannot
        # hang the calling Flask request forever.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except Exception as e:
        # Best-effort lookup: log and return an empty result instead of
        # crashing the route that called us.
        print(f"Errore fetch PubMed: {e}")
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
@app.route("/")
|
120 |
def index():
|
|
|
151 |
year_start = request.form.get("year_start", "2000")
|
152 |
year_end = request.form.get("year_end", "2025")
|
153 |
num_articles = int(request.form.get("num_articles", "10"))
|
154 |
+
pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
|
155 |
|
156 |
+
if not pubmed_ids:
|
157 |
+
flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
|
|
|
|
|
158 |
return redirect(url_for("index"))
|
159 |
|
160 |
+
pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
|
161 |
+
pubmed_texts = [r[1] for r in pubmed_results]
|
162 |
+
pubmed_titles = [r[0] for r in pubmed_results]
|
163 |
|
164 |
+
results = validate_document(pdf_path, pubmed_texts, method="pubmed", titles=pubmed_titles)
|
165 |
|
166 |
return render_template("NORUS.html", results=results, keywords=last_common_keywords)
|
167 |
|
|
|
174 |
pdf = FPDF()
|
175 |
pdf.add_page()
|
176 |
pdf.set_font("Arial", "B", 16)
|
177 |
+
pdf.cell(0, 10, "NORUS Tool - Report Analisi", ln=True, align="C")
|
178 |
pdf.ln(10)
|
179 |
pdf.set_font('Arial', '', 12)
|
180 |
+
pdf.multi_cell(0, 10, "Indice OUI = alpha(1 - sim/100) + beta(1 - overlap/100), con alpha = 0.7 e beta = 0.3.\nValori più bassi di OUI indicano maggiore similarità semantica e testuale.")
|
181 |
pdf.ln(5)
|
182 |
pdf.set_font("Arial", "B", 12)
|
183 |
+
pdf.cell(90, 10, "Titolo", 1)
|
184 |
pdf.cell(30, 10, "Sim %", 1)
|
185 |
pdf.cell(30, 10, "Overlap %", 1)
|
186 |
pdf.cell(30, 10, "OUI", 1)
|
|
|
198 |
if last_common_keywords:
|
199 |
pdf.ln(6)
|
200 |
pdf.set_font("Arial", "B", 12)
|
201 |
+
pdf.cell(0, 10, "Parole chiave comuni:", ln=True)
|
202 |
pdf.set_font("Arial", "", 11)
|
203 |
for kw, count in last_common_keywords:
|
204 |
pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
|
|
|
208 |
pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
|
209 |
|
210 |
output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
|
211 |
+
pdf.output(output_path, 'F')
|
212 |
|
213 |
return send_file(output_path, as_attachment=True)
|
214 |
|