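"""NORUS: Flask application that checks an uploaded PDF for semantic similarity
and token overlap against local PDF files or PubMed abstracts."""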
import os
import xml.etree.ElementTree as ET

import requests
import pdfplumber
from flask import Flask, render_template, request, redirect, url_for, flash
from werkzeug.utils import secure_filename
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# SciBERT encoder loaded through sentence-transformers; since it is not a native
# sentence-transformers model, mean pooling is applied to produce sentence embeddings.
model = SentenceTransformer("allenai/scibert_scivocab_uncased")

def extract_pdf_text(pdf_path):
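    """Extract plain text from every page of a PDF, lower-cased and stripped."""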
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or " "
    except Exception as e:
        print(f"Text extraction error: {e}")
    return text.lower().strip()

def preprocess_text(text):
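    """Lower-case and tokenize the text, drop stop words and punctuation, then
    lemmatize and stem each token. (Defined as an optional normalisation step;
    not currently called by the comparison pipeline.)"""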
    text = text.lower()
    words = word_tokenize(text)
    words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words if w.isalnum() and w not in stop_words]
    return " ".join(words)

def calculate_token_overlap(text1, text2):
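    """Return the percentage of unique tokens in text1 that also occur in text2."""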
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)

def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
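    """Combine semantic similarity and token overlap (both 0-100) into a single
    0-100 score, weighted by alpha and beta; higher values indicate less overlap
    with the comparison document."""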
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    return round(max(0, min(oui * 100, 100)), 2)

def validate_document(pdf_path, comparison_sources, method="local", titles=None):
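    """Compare the uploaded PDF against each comparison source.

    With method="local" each source is a path to a PDF file; with method="pubmed"
    each source is the abstract/keyword text fetched from PubMed. Returns one dict
    per source with its title, cosine similarity, token overlap and OUI score.
    """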
    pdf_text = extract_pdf_text(pdf_path)
    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        similarity = util.pytorch_cos_sim(
            model.encode(pdf_text, convert_to_tensor=True),
            model.encode(doc_text, convert_to_tensor=True)
        ).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)
        if titles and i < len(titles):
            title = titles[i]
        elif method == "local":
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"
        results.append({"title": title, "similarity": round(similarity, 2), "token_overlap": round(token_overlap, 2), "oui": round(oui, 2)})
    return results

def fetch_pubmed_details(article_id):
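    """Fetch the title, abstract and keywords of a PubMed article via the NCBI efetch API."""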
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        root = ET.fromstring(response.text)
        title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
        abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
        keywords = root.findall(".//Keyword")
        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else "No Keywords"
        print(f"\nRetrieved article\nTitle: {title}\nAbstract: {abstract[:500]}...\nKeywords: {keyword_text}\n")
        return title, f"{abstract} {keyword_text}"
    except requests.exceptions.RequestException as e:
        print(f"Error fetching abstract: {e}")
        return "No Title", "No Abstract"

def fetch_pubmed(query, year_start, year_end, max_results=10):
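    """Search PubMed via the NCBI esearch API and return up to max_results article
    IDs published between year_start and year_end."""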
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])", "retmax": max_results, "retmode": "json"}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PubMed articles: {e}")
        return []

@app.route("/")
def index():
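    """Render the upload and search form (NORUS.html)."""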
    return render_template("NORUS.html")

@app.route("/validate", methods=["POST"])
def validate():
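    """Save the uploaded PDF and compare it against either a local directory of
    PDFs or a set of PubMed abstracts, depending on the selected analysis type."""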
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()
    if not pdf_file or not pdf_file.filename:
        flash("Please upload a valid PDF file.", "error")
        return redirect(url_for("index"))
    filename = secure_filename(pdf_file.filename)
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)
    results = []
    if analysis_type == "local":
        if not os.path.isdir(local_dir):
            flash("Please select a valid directory.", "error")
            return redirect(url_for("index"))
        comparison_files = [os.path.join(local_dir, f) for f in os.listdir(local_dir) if f.endswith(".pdf")]
        if not comparison_files:
            flash("The directory contains no PDF files.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        num_articles = int(request.form.get("num_articles", "10"))
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [abstract for _, abstract in pubmed_results],
            method="pubmed",
            titles=[title for title, _ in pubmed_results],
        )
    return render_template("NORUS.html", results=results)

if __name__ == "__main__":
    app.run(debug=True, port=7860)