import os
import requests
import pdfplumber
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
from werkzeug.utils import secure_filename
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from fpdf import FPDF  # Use fpdf to avoid unicode errors
from collections import Counter
from io import BytesIO  # BytesIO for building the PDF in memory

# Use the Hugging Face tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

app = Flask(__name__)
app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# SciBERT embeddings via sentence-transformers (a mean-pooling wrapper is created
# automatically when a plain Hugging Face model name is passed)
model = SentenceTransformer("allenai/scibert_scivocab_uncased")

last_results = []
last_common_keywords = []
def extract_pdf_text(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or " "
    except Exception as e:
        print(f"Text extraction error: {e}")
    return text.lower().strip()
def preprocess_text(text):
    # Tokenize the text with the Hugging Face tokenizer
    tokens = tokenizer.tokenize(text.lower())
    # Keep only meaningful tokens (drop numbers, symbols, and very short words)
    tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
    return tokens
def calculate_token_overlap(text1, text2):
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)
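
# Illustrative example (made-up tokens): if text1 yields {"neural", "network", "model", "data"}
# and text2 yields {"network", "data", "graph"}, the intersection has 2 tokens, so the
# overlap is 2 / 4 * 100 = 50.0. The percentage is relative to text1's vocabulary size.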
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    result = round(oui * 100, 2)
    return 0.0 if result == -0.0 else result
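
# Worked example with the default weights: for similarity = 80.0 and token_overlap = 50.0,
# oui = 0.7 * (1 - 0.80) + 0.3 * (1 - 0.50) = 0.14 + 0.15 = 0.29, reported as 29.0.
# Lower OUI values therefore mean the uploaded document is closer to the comparison source.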
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    pdf_text = extract_pdf_text(pdf_path)
    pdf_tokens = preprocess_text(pdf_text)
    results = []
    all_keywords = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_tokens = preprocess_text(doc_text)
        similarity = util.pytorch_cos_sim(
            model.encode(pdf_text, convert_to_tensor=True),
            model.encode(doc_text, convert_to_tensor=True)
        ).item() * 100
        token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
        oui = calculate_oui(similarity, token_overlap)
        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
        common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
        all_keywords.extend(common_keywords)
        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2)
        })
    global last_results, last_common_keywords
    last_results = results
    last_common_keywords = Counter(all_keywords).most_common(10)
    return results
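
# Usage sketch (hypothetical file names): validate_document("uploads/paper.pdf",
# ["uploads/ref1.pdf", "uploads/ref2.pdf"], method="local") returns one dict per
# comparison source with its similarity, token_overlap and oui scores, and caches
# the results in last_results / last_common_keywords for the report download below.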
def fetch_pubmed_details(article_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        import xml.etree.ElementTree as ET
        root = ET.fromstring(response.text)
        title = root.find(".//ArticleTitle").text or "No Title"
        abstract = root.find(".//AbstractText").text or "No Abstract"
        keywords = root.findall(".//Keyword")
        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
        return title, f"{abstract} {keyword_text}"
    except Exception as e:
        print(f"Abstract retrieval error: {e}")
        return "No Title", "No Abstract"
def fetch_pubmed(query, year_start, year_end, max_results=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json"
    }
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except Exception as e:
        print(f"PubMed fetch error: {e}")
        return []
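
# Illustrative call: fetch_pubmed("crispr", "2020", "2025", 5) queries the NCBI esearch
# endpoint with a publication-date filter and returns up to 5 PubMed IDs as strings
# (an empty list on error); each ID can then be passed to fetch_pubmed_details to
# retrieve the article title plus abstract and keyword text for comparison.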
# Flask view functions; the route paths below are assumed (endpoint names match the url_for calls).
@app.route("/")
def index():
    return render_template("NORUS.html")
@app.route("/validate", methods=["POST"])  # assumed path and method
def validate():
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    query = request.form.get("query", "").strip()
    if not pdf_file:
        flash("Upload a valid PDF file.", "error")
        return redirect(url_for("index"))
    filename = secure_filename(pdf_file.filename)
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)
    if analysis_type == "local":
        comparison_files = request.files.getlist("comparison_files")
        saved_paths = []
        for file in comparison_files:
            if file and file.filename.endswith(".pdf"):
                fname = secure_filename(file.filename)
                path = os.path.join(app.config["UPLOAD_FOLDER"], fname)
                file.save(path)
                saved_paths.append(path)
        if not saved_paths:
            flash("No comparison files uploaded.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, saved_paths, method="local")
    else:
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        num_articles = int(request.form.get("num_articles", "10"))
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
        results = validate_document(pdf_path, [r[1] for r in pubmed_results], method="pubmed", titles=[r[0] for r in pubmed_results])
    return render_template("NORUS.html", results=results, keywords=last_common_keywords)
@app.route("/download_report")  # assumed path
def download_report():
    if not last_results:
        flash("No results to export.", "error")
        return redirect(url_for("index"))
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "NORUS Tool - Analysis Report", ln=True, align="C")
    pdf.ln(10)
    # Keep the formula in plain ASCII so the built-in fonts can render it
    pdf.set_font('Arial', '', 12)
    pdf.multi_cell(0, 10, "OUI index = alpha(1 - sim/100) + beta(1 - overlap/100), with alpha = 0.7 and beta = 0.3.\nLower OUI values indicate greater semantic and textual similarity.")
    pdf.ln(5)
    pdf.set_font("Arial", "B", 12)
    pdf.cell(90, 10, "Title", 1)
    pdf.cell(30, 10, "Sim %", 1)
    pdf.cell(30, 10, "Overlap %", 1)
    pdf.cell(30, 10, "OUI", 1)
    pdf.ln()
    pdf.set_font("Arial", "", 11)
    for res in last_results:
        title = res["title"][:40] + "..." if len(res["title"]) > 43 else res["title"]
        pdf.cell(90, 10, title, 1)
        pdf.cell(30, 10, str(res["similarity"]), 1)
        pdf.cell(30, 10, str(res["token_overlap"]), 1)
        pdf.cell(30, 10, str(res["oui"]), 1)
        pdf.ln()
    if last_common_keywords:
        pdf.ln(6)
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Common keywords:", ln=True)
        pdf.set_font("Arial", "", 11)
        for kw, count in last_common_keywords:
            pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
    pdf.set_y(-20)
    pdf.set_font("Arial", "I", 9)
    pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
    # Save the PDF into the uploads folder
    output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
    pdf.output(output_path, 'F')  # Write the file to disk
    return send_file(output_path, as_attachment=True)  # Force download of the PDF report
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=7860)