"""Flask app that scores an uploaded PDF against local PDFs or PubMed records.

For every comparison document the app reports an embedding cosine similarity,
a token-overlap percentage, and a combined "oui" score derived from both.
"""

import os
import xml.etree.ElementTree as ET

import nltk
import numpy as np
import pdfplumber
import requests
from flask import Flask, render_template, request, redirect, url_for, flash
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from werkzeug.utils import secure_filename

# NLTK resources needed by preprocess_text(); fetched at import time.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores messages in the session, which Flask refuses to use without
# a secret key. Prefer a configured key; fall back to a per-process random one.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

model = SentenceTransformer("allenai/scibert_scivocab_uncased")

# Seconds to wait for the NCBI E-utilities endpoints before giving up, so a
# stalled connection cannot block a request thread indefinitely.
REQUEST_TIMEOUT = 30


def extract_pdf_text(pdf_path):
    """Return the lower-cased, stripped text of every page of a PDF.

    Extraction is best-effort: on any error the problem is printed and the
    text gathered so far (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Plain loop (not a comprehension) so that pages read before a
            # failure are still kept.
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Errore estrazione testo: {e}")
    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next.
    return "\n".join(page_texts).lower().strip()


def preprocess_text(text):
    """Tokenize *text*, drop stop words and non-alphanumeric tokens, then
    lemmatize and stem each remaining token.

    Returns the processed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept = [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(kept)


def calculate_token_overlap(text1, text2):
    """Return the percentage of unique whitespace tokens of *text1* that also
    occur in *text2*, rounded to two decimals.

    The denominator is floored at 1 so an empty *text1* yields 0.0.
    """
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    shared = len(tokens1 & tokens2)
    return round(shared / max(len(tokens1), 1) * 100, 2)


def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Combine similarity and token overlap (both percentages) into one score.

    Computes ``alpha * (1 - similarity/100) + beta * (1 - token_overlap/100)``,
    scales it to a percentage, clamps it to [0, 100] and rounds to two
    decimals. Higher input similarity/overlap gives a lower score.
    """
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    return round(max(0, min(oui * 100, 100)), 2)


def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Score the PDF at *pdf_path* against each comparison source.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: PDF paths when ``method == "local"``; otherwise
            the comparison texts themselves.
        method: ``"local"`` to read each source as a PDF file; any other
            value treats each source as raw text.
        titles: Optional titles, matched to sources by index.

    Returns:
        A list of dicts with the keys ``title``, ``similarity``,
        ``token_overlap`` and ``oui`` (all numeric values rounded to 2 dp).
    """
    pdf_text = extract_pdf_text(pdf_path)
    # The uploaded document does not change between iterations: encode once.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)

    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        if titles and i < len(titles):
            title = titles[i]
        elif method == "local":
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results


def _element_text(element):
    """Return all text inside an XML element, including nested markup, or ""."""
    if element is None:
        return ""
    # .text alone stops at the first child tag (e.g. <i>, <sub>), and is None
    # when the element starts with one; itertext() walks the whole subtree.
    return "".join(element.itertext()).strip()


def fetch_pubmed_details(article_id):
    """Fetch title, abstract and keywords of a PubMed record via efetch.

    Returns:
        ``(title, text)`` where *text* is the abstract followed by the
        keywords. Missing parts are replaced by ``"No Title"``,
        ``"No Abstract"`` and ``"No Keywords"``. On a network, HTTP or XML
        parsing failure the error is printed and
        ``("No Title", "No Abstract")`` is returned.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Parse the raw bytes so the parser honours the XML encoding
        # declaration instead of requests' guessed text encoding.
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    title = _element_text(root.find(".//ArticleTitle")) or "No Title"

    # Structured abstracts carry several <AbstractText> sections; keep all.
    abstract_parts = [_element_text(node) for node in root.findall(".//AbstractText")]
    abstract = " ".join(part for part in abstract_parts if part) or "No Abstract"

    keywords = [_element_text(kw) for kw in root.findall(".//Keyword")]
    keyword_text = " ".join(kw for kw in keywords if kw) or "No Keywords"

    print(
        f"\n🔍 ARTICOLO RECUPERATO\n📖 Titolo: {title}\n"
        f"📝 Abstract: {abstract[:500]}...\n🔑 Keywords: {keyword_text}\n"
    )
    return title, f"{abstract} {keyword_text}"


def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed via esearch and return the list of matching article IDs.

    The query is restricted to the publication-date range
    ``year_start``..``year_end``. Returns an empty list when the request
    fails or the response is not valid JSON (the error is printed).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # JSONDecodeError is not a RequestException.
        print(f"Errore recupero articoli PubMed: {e}")
        return []


@app.route("/")
def index():
    """Render the main page."""
    return render_template("NORUS.html")


@app.route("/validate", methods=["POST"])
def validate():
    """Save the uploaded PDF, run the requested analysis and render results.

    Form fields read: ``pdf_file``, ``analysis_type`` (``"local"`` or
    ``"pubmed"``), ``local_directory``, ``query``, ``year_start``,
    ``year_end`` and ``num_articles``. Invalid input flashes an error and
    redirects to the index page. Any other ``analysis_type`` renders the
    page with an empty result list.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()

    # secure_filename() can reduce a name such as ".." to an empty string,
    # which would make the save target the upload directory itself.
    filename = secure_filename(pdf_file.filename or "") if pdf_file else ""
    if not filename:
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))

    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(review): local_dir comes straight from the form, so any
        # directory readable by the server process can be listed and parsed.
        # Acceptable for a single-user local tool; restrict it before
        # exposing this app to untrusted users.
        if not os.path.isdir(local_dir):
            flash("Seleziona una directory valida.", "error")
            return redirect(url_for("index"))
        comparison_files = [
            os.path.join(local_dir, name)
            for name in os.listdir(local_dir)
            if name.lower().endswith(".pdf")
        ]
        if not comparison_files:
            flash("La directory non contiene PDF.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            flash("Numero di articoli non valido.", "error")
            return redirect(url_for("index"))
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [text for _, text in pubmed_results],
            method="pubmed",
            titles=[title for title, _ in pubmed_results],
        )

    return render_template("NORUS.html", results=results)


if __name__ == "__main__":
    app.run(debug=True, port=7860)