import os
import xml.etree.ElementTree as ET
from collections import Counter

import pdfplumber
import requests
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
from fpdf import FPDF
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from werkzeug.utils import secure_filename

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

app = Flask(__name__)
app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# SciBERT is a plain transformers checkpoint, so sentence-transformers wraps it
# with a mean-pooling layer automatically (it prints a warning on first load).
model = SentenceTransformer("allenai/scibert_scivocab_uncased")

last_results = []
last_common_keywords = []


def extract_pdf_text(pdf_path):
    """Extract the full text of a PDF, lowercased and stripped."""
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or " "
    except Exception as e:
        print(f"Text extraction error: {e}")
    return text.lower().strip()


def preprocess_text(text):
    """Tokenize with the BERT tokenizer, keeping alphabetic tokens longer than 3 characters."""
    tokens = tokenizer.tokenize(text.lower())
    return [token for token in tokens if len(token) > 3 and token.isalpha()]


def calculate_token_overlap(text1, text2):
    """Percentage of unique tokens in text1 that also occur in text2."""
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)


def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """OUI score on a 0-100 scale; lower values mean more semantic and lexical overlap.

    E.g. similarity=80.0, token_overlap=50.0 gives 0.7*0.2 + 0.3*0.5 = 0.29, i.e. 29.0.
    """
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    result = round(oui * 100, 2)
    return 0.0 if result == -0.0 else result  # normalize negative zero


def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Score the uploaded PDF against local PDFs or PubMed abstracts."""
    global last_results, last_common_keywords
    pdf_text = extract_pdf_text(pdf_path)
    pdf_tokens = preprocess_text(pdf_text)
    # Encode the uploaded document once rather than once per comparison.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    results = []
    all_keywords = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_tokens = preprocess_text(doc_text)
        similarity = util.pytorch_cos_sim(
            pdf_embedding,
            model.encode(doc_text, convert_to_tensor=True)
        ).item() * 100
        token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
        oui = calculate_oui(similarity, token_overlap)
        if titles and i < len(titles):
            title = titles[i]
        elif method == "local":
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"
        common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
        all_keywords.extend(common_keywords)
        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    last_results = results
    last_common_keywords = Counter(all_keywords).most_common(10)
    return results


def fetch_pubmed_details(article_id):
    """Fetch title, abstract, and keywords for one PubMed article via efetch."""
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
        title_el = root.find(".//ArticleTitle")
        title = title_el.text if title_el is not None and title_el.text else "No Title"
        abstract_el = root.find(".//AbstractText")
        abstract = abstract_el.text if abstract_el is not None and abstract_el.text else "No Abstract"
        keyword_text = " ".join(kw.text for kw in root.findall(".//Keyword") if kw.text)
        return title, f"{abstract} {keyword_text}"
    except Exception as e:
        print(f"Abstract fetch error: {e}")
        return None  # return None if anything goes wrong
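# Illustrative example (the ID below is made up): fetch_pubmed_details("12345678")
# issues a GET request to
#   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=12345678&retmode=xml
# and returns a ("Article Title", "abstract text keyword1 keyword2 ...") tuple,
# or None if the request or the XML parsing fails.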
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" params = { "db": "pubmed", "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])", "retmax": max_results, "retmode": "json", "sort": "relevance" # <-- Qui abbiamo ordinato per rilevanza } try: response = requests.get(base_url, params=params) response.raise_for_status() id_list = response.json().get("esearchresult", {}).get("idlist", []) return id_list except Exception as e: print(f"Errore fetch PubMed: {e}") return [] @app.route("/") def index(): return render_template("NORUS.html") @app.route("/validate", methods=["POST"]) def validate(): pdf_file = request.files.get("pdf_file") analysis_type = request.form.get("analysis_type") query = request.form.get("query", "").strip() if not pdf_file: flash("Carica un file PDF valido.", "error") return redirect(url_for("index")) filename = secure_filename(pdf_file.filename) pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename) pdf_file.save(pdf_path) if analysis_type == "local": comparison_files = request.files.getlist("comparison_files") saved_paths = [] for file in comparison_files: if file and file.filename.endswith(".pdf"): fname = secure_filename(file.filename) path = os.path.join(app.config["UPLOAD_FOLDER"], fname) file.save(path) saved_paths.append(path) if not saved_paths: flash("Nessun file di confronto caricato.", "error") return redirect(url_for("index")) results = validate_document(pdf_path, saved_paths, method="local") else: year_start = request.form.get("year_start", "2000") year_end = request.form.get("year_end", "2025") num_articles = int(request.form.get("num_articles", "10")) pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles) if not pubmed_ids: flash("❌ Nessun articolo trovato su PubMed per questa ricerca.", "error") return redirect(url_for("index")) pubmed_results = [] for id_ in pubmed_ids: result = fetch_pubmed_details(id_) if result: # Aggiungi solo se il risultato non è None pubmed_results.append(result) # Ora puoi accedere a pubmed_results senza errori pubmed_texts = [r[1] for r in pubmed_results] # Estrai i testi pubmed_titles = [r[0] for r in pubmed_results] # Estrai i titoli results = validate_document(pdf_path, pubmed_texts, method="pubmed", titles=pubmed_titles) return render_template("NORUS.html", results=results, keywords=last_common_keywords) @app.route("/download_report", methods=["POST"]) def download_report(): if not last_results: flash("Nessun risultato da esportare.", "error") return redirect(url_for("index")) pdf = FPDF() pdf.add_page() pdf.set_font("Arial", "B", 16) pdf.cell(0, 10, "NORUS Tool - Report Analisi", ln=True, align="C") pdf.ln(10) pdf.set_font('Arial', '', 12) pdf.multi_cell(0, 10, "Indice OUI = alpha(1 - sim/100) + beta(1 - overlap/100), con alpha = 0.7 e beta = 0.3.\nValori più bassi di OUI indicano maggiore similarità semantica e testuale.") pdf.ln(5) pdf.set_font("Arial", "B", 12) pdf.cell(90, 10, "Titolo", 1) pdf.cell(30, 10, "Sim %", 1) pdf.cell(30, 10, "Overlap %", 1) pdf.cell(30, 10, "OUI", 1) pdf.ln() pdf.set_font("Arial", "", 11) for res in last_results: title = res["title"][:40] + "..." 
if len(res["title"]) > 43 else res["title"] pdf.cell(90, 10, title, 1) pdf.cell(30, 10, str(res["similarity"]), 1) pdf.cell(30, 10, str(res["token_overlap"]), 1) pdf.cell(30, 10, str(res["oui"]), 1) pdf.ln() if last_common_keywords: pdf.ln(6) pdf.set_font("Arial", "B", 12) pdf.cell(0, 10, "Parole chiave comuni:", ln=True) pdf.set_font("Arial", "", 11) for kw, count in last_common_keywords: pdf.cell(0, 10, f"- {kw} ({count})", ln=True) pdf.set_y(-20) pdf.set_font("Arial", "I", 9) pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C") output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf") pdf.output(output_path, 'F') return send_file(output_path, as_attachment=True) if __name__ == "__main__": app.run(debug=True, host="0.0.0.0", port=7860)