import os
import requests
import pdfplumber
import numpy as np
from flask import Flask, render_template, request, redirect, url_for, flash
from werkzeug.utils import secure_filename
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))
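# Flask application setup; uploaded PDFs are saved under ./uploads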
app = Flask(__name__)
app.secret_key = os.urandom(24)  # flash() stores messages in the session, which requires a secret key
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
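# Sentence-embedding model used for semantic similarity between documents (SciBERT vocabulary for scientific text)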
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
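# Extract the full text of a PDF with pdfplumber and return it lower-cased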
def extract_pdf_text(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or " "
    except Exception as e:
        print(f"Text extraction error: {e}")
    return text.lower().strip()
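# Tokenize, drop stopwords and punctuation, then lemmatize and stem (not called by the current validation flow)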
def preprocess_text(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words if w.isalnum() and w not in stop_words]
    return " ".join(words)
def calculate_token_overlap(text1, text2):
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)
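# OUI score: weighted combination of (1 - similarity) and (1 - token overlap), scaled and clamped to 0-100 (higher = less overlap with the compared document)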
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    return round(max(0, min(oui * 100, 100)), 2)
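# Compare the uploaded PDF against each source (local PDF paths or PubMed abstract texts) and collect similarity, token overlap and OUI per source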
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    pdf_text = extract_pdf_text(pdf_path)
    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        similarity = util.pytorch_cos_sim(
            model.encode(pdf_text, convert_to_tensor=True),
            model.encode(doc_text, convert_to_tensor=True)
        ).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)
        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
        results.append({"title": title, "similarity": round(similarity, 2), "token_overlap": round(token_overlap, 2), "oui": round(oui, 2)})
    return results
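# Fetch title, abstract and keywords for a single PubMed article via the NCBI EFetch API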
def fetch_pubmed_details(article_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        import xml.etree.ElementTree as ET
        root = ET.fromstring(response.text)
        title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
        abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
        keywords = root.findall(".//Keyword")
        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else "No Keywords"
        print(f"\nARTICLE RETRIEVED\nTitle: {title}\nAbstract: {abstract[:500]}...\nKeywords: {keyword_text}\n")
        return title, f"{abstract} {keyword_text}"
    except requests.exceptions.RequestException as e:
        print(f"Abstract retrieval error: {e}")
        return "No Title", "No Abstract"
def fetch_pubmed(query, year_start, year_end, max_results=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])", "retmax": max_results, "retmode": "json"}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except requests.exceptions.RequestException as e:
        print(f"PubMed search error: {e}")
        return []
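# Main page: renders the NORUS upload/search form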
@app.route("/")
def index():
return render_template("NORUS.html")
@app.route("/validate", methods=["POST"])
def validate():
pdf_file = request.files.get("pdf_file")
analysis_type = request.form.get("analysis_type")
local_dir = request.form.get("local_directory", "").strip()
query = request.form.get("query", "").strip()
if not pdf_file:
flash("Carica un file PDF valido.", "error")
return redirect(url_for("index"))
filename = secure_filename(pdf_file.filename)
pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
pdf_file.save(pdf_path)
results = []
if analysis_type == "local":
if not os.path.isdir(local_dir):
flash("Seleziona una directory valida.", "error")
return redirect(url_for("index"))
comparison_files = [os.path.join(local_dir, f) for f in os.listdir(local_dir) if f.endswith(".pdf")]
if not comparison_files:
flash("La directory non contiene PDF.", "error")
return redirect(url_for("index"))
results = validate_document(pdf_path, comparison_files, method="local")
elif analysis_type == "pubmed":
year_start = request.form.get("year_start", "2000")
year_end = request.form.get("year_end", "2025")
num_articles = int(request.form.get("num_articles", "10"))
pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
results = validate_document(pdf_path, [result[1] for result in pubmed_results], method="pubmed", titles=[result[0] for result in pubmed_results])
return render_template("NORUS.html", results=results)
if __name__ == "__main__":
app.run(debug=True, port=7860)