Spaces:
Sleeping
Sleeping
Commit
Β·
8c42b67
0
Parent(s):
π Deploy NORUS tool completo su Hugging Face senza binary files
Browse files- .gitattributes +1 -0
- .gitignore +6 -0
- Dockerfile +13 -0
- README.md +71 -0
- app.py +133 -0
- models/similarity_model.py +32 -0
- requirements.txt +12 -0
- start_local.sh +8 -0
- static/css/style.css +200 -0
- static/js/script.js +119 -0
- templates/NORUS.html +173 -0
- templates/app.py +134 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
static/images/norus.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.pyc
|
3 |
+
.DS_Store
|
4 |
+
venv/
|
5 |
+
uploads/
|
6 |
+
static/images/norus.png
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9
|
2 |
+
|
3 |
+
RUN useradd -m -u 1000 user
|
4 |
+
USER user
|
5 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
6 |
+
|
7 |
+
WORKDIR /app
|
8 |
+
|
9 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
10 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
11 |
+
|
12 |
+
COPY --chown=user . /app
|
13 |
+
CMD ["python", "app.py"]
|
README.md
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NORUS Tool π§ π
|
2 |
+
|
3 |
+
**NORUS** (Novelty and Originality Recognition Utility System) è uno strumento basato su intelligenza artificiale che consente l'analisi semantica di articoli scientifici in formato PDF, confrontandoli con articoli locali o pubblicati su PubMed. Il tool restituisce misure di **similarità semantica**, **token overlap** e un indice composito chiamato **OUI (Originality & Uniqueness Index)**.
|
4 |
+
|
5 |
+
## π FunzionalitΓ principali
|
6 |
+
|
7 |
+
- β
Caricamento PDF da analizzare
|
8 |
+
- π Confronto con PDF locali o articoli da PubMed
|
9 |
+
- π€ Estrazione di embedding semantici tramite SciBERT
|
10 |
+
- π Calcolo di:
|
11 |
+
- Similarità semantica (cosine similarity)
|
12 |
+
- Sovrapposizione testuale (token overlap)
|
13 |
+
- Indice OUI (originalità e novità)
|
14 |
+
- π Visualizzazione interattiva dei risultati via Chart.js
|
15 |
+
|
16 |
+
## π§ͺ OUI - Originality & Uniqueness Index
|
17 |
+
|
18 |
+
```math
|
19 |
+
OUI = 1 - (\alpha \times \text{semantic\_similarity} + \beta \times \text{token\_overlap})
|
20 |
+
```
|
21 |
+
|
22 |
+
- α = 0.7 → penalizza la somiglianza semantica
|
23 |
+
- β = 0.3 → penalizza la ripetizione letterale
|
24 |
+
- L'OUI misura **quanto un documento è originale**, sia nel contenuto che nella forma.
|
25 |
+
|
26 |
+
## π§± Architettura
|
27 |
+
|
28 |
+
- `Flask` come backend web
|
29 |
+
- `pdfplumber` per l'estrazione del testo dai PDF
|
30 |
+
- `nltk` per preprocessing linguistico
|
31 |
+
- `sentence-transformers` con modello `allenai/scibert_scivocab_uncased`
|
32 |
+
- `requests` per l'interfaccia con PubMed
|
33 |
+
|
34 |
+
## π Struttura del progetto
|
35 |
+
|
36 |
+
```
|
37 |
+
.
|
38 |
+
βββ app.py
|
39 |
+
βββ Dockerfile
|
40 |
+
βββ requirements.txt
|
41 |
+
βββ static/
|
42 |
+
βββ templates/
|
43 |
+
βββ uploads/
|
44 |
+
βββ README.md
|
45 |
+
```
|
46 |
+
|
47 |
+
## βΆοΈ Esecuzione locale
|
48 |
+
|
49 |
+
Per eseguire localmente:
|
50 |
+
|
51 |
+
1. Assicurati di avere Python 3.9+
|
52 |
+
2. Installa le dipendenze:
|
53 |
+
|
54 |
+
```bash
|
55 |
+
pip install -r requirements.txt
|
56 |
+
```
|
57 |
+
|
58 |
+
3. Avvia l'app:
|
59 |
+
|
60 |
+
```bash
|
61 |
+
python app.py
|
62 |
+
```
|
63 |
+
|
64 |
+
Apri il browser su `http://localhost:7860`
|
65 |
+
|
66 |
+
## π‘ Deploy su Hugging Face Spaces
|
67 |
+
|
68 |
+
Puoi caricare questo progetto come Space Docker-based su Hugging Face. Il `Dockerfile` è già configurato.
|
69 |
+
|
70 |
+
---
|
71 |
+
Developed by Marina Bilotta — Computational Chemistry & AI Research
|
app.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import pdfplumber
|
4 |
+
import numpy as np
|
5 |
+
from flask import Flask, render_template, request, redirect, url_for, flash
|
6 |
+
from werkzeug.utils import secure_filename
|
7 |
+
from sentence_transformers import SentenceTransformer, util
|
8 |
+
import nltk
|
9 |
+
from nltk.stem import WordNetLemmatizer, PorterStemmer
|
10 |
+
from nltk.tokenize import word_tokenize
|
11 |
+
from nltk.corpus import stopwords
|
12 |
+
|
13 |
+
# Fetch the NLTK data used by word_tokenize(), WordNetLemmatizer and the
# stop-word list.  Recent NLTK releases load the tokenizer tables from
# "punkt_tab" rather than "punkt", so both ids are requested.
# NOTE(review): on older NLTK an unknown id is expected to be reported by the
# downloader rather than raised -- confirm against the pinned NLTK version.
for _resource in ("punkt", "punkt_tab", "wordnet", "stopwords"):
    nltk.download(_resource)

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores its messages in the session, and Flask refuses to open a
# session without a secret key.  Prefer a key from the environment; the random
# fallback is regenerated on every start, so flashed messages do not survive a
# restart or cross worker processes.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Sentence-embedding model shared by every request (loaded once at import).
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
|
26 |
+
|
27 |
+
def extract_pdf_text(pdf_path):
    """Return the lower-cased text of a PDF, or whatever was read before a failure.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, joined with newlines, lower-cased and stripped.
        If the file cannot be opened or a page fails to parse, the error is
        printed and the text collected so far (possibly "") is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # A page without extractable text contributes an empty entry.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Deliberate best-effort: a broken PDF must not abort the whole analysis.
        print(f"Errore estrazione testo: {e}")
    # Join with a separator so the last word of one page and the first word of
    # the next are not fused into a single token.
    return "\n".join(page_texts).lower().strip()
|
36 |
+
|
37 |
+
def preprocess_text(text):
    """Normalise text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised with NLTK; tokens that are not purely
    alphanumeric or that appear in the English stop-word list are dropped, and
    each remaining token is lemmatised and then stemmed.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept = (tok for tok in tokens if tok.isalnum() and tok not in stop_words)
    return " ".join(stemmer.stem(lemmatizer.lemmatize(tok)) for tok in kept)
|
42 |
+
|
43 |
+
def calculate_token_overlap(text1, text2):
    """Return the percentage of distinct tokens of ``text1`` also found in ``text2``.

    Tokens are the whitespace-separated words of each text; duplicates count
    once.  The measure is asymmetric: it is normalised by the number of
    distinct tokens in ``text1`` only.

    Args:
        text1: Reference text.
        text2: Text to compare against.

    Returns:
        A float in [0, 100] rounded to two decimals; 0.0 when ``text1`` has no
        tokens.
    """
    reference = set(text1.split())
    if not reference:
        return 0.0
    shared = reference.intersection(text2.split())
    return round((len(shared) / len(reference)) * 100, 2)
|
48 |
+
|
49 |
+
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality & Uniqueness Index as a percentage.

    Args:
        similarity: Semantic similarity on a 0-100 scale.
        token_overlap: Token overlap on a 0-100 scale.
        alpha: Weight of the semantic component.
        beta: Weight of the token-overlap component.

    Returns:
        ``alpha * (1 - similarity/100) + beta * (1 - token_overlap/100)``
        expressed on a 0-100 scale, clamped to [0, 100] and rounded to two
        decimals.  The value is already a percentage; callers must not scale it
        again.
    """
    semantic_novelty = 1 - similarity / 100
    lexical_novelty = 1 - token_overlap / 100
    score = (alpha * semantic_novelty + beta * lexical_novelty) * 100
    return round(max(0, min(score, 100)), 2)
|
52 |
+
|
53 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare one PDF against a list of reference documents.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: With ``method="local"``, paths of PDFs to compare
            against; otherwise the reference texts themselves.
        method: ``"local"`` to read each source as a PDF file; any other value
            treats each source as plain text.
        titles: Optional titles, matched to ``comparison_sources`` by index.

    Returns:
        One dict per source with keys ``title``, ``similarity``,
        ``token_overlap`` and ``oui`` (all three numbers on a 0-100 scale,
        rounded to two decimals).
    """
    pdf_text = extract_pdf_text(pdf_path)
    results = []
    if not comparison_sources:
        return results

    is_local = method == "local"
    # The uploaded document does not change between comparisons: embed it once
    # instead of once per reference document.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)

    for i, doc in enumerate(comparison_sources):
        # extract_pdf_text() lower-cases its output, so plain-text sources are
        # lower-cased too; otherwise "The" and "the" never match in the
        # token-overlap computation.
        doc_text = extract_pdf_text(doc) if is_local else doc.lower()
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        if titles and i < len(titles):
            title = titles[i]
        elif is_local:
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results
|
67 |
+
|
68 |
+
def fetch_pubmed_details(article_id):
    """Fetch the title and abstract-plus-keywords text of one PubMed article.

    Args:
        article_id: PubMed identifier passed to the EFetch endpoint.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by the
        keywords.  Missing fields are replaced by "No Title", "No Abstract" and
        "No Keywords".  On a network error or unparsable response the error is
        printed and ``("No Title", "No Abstract")`` is returned.
    """
    import xml.etree.ElementTree as ET

    def _element_text(elem):
        # itertext() also collects text inside inline markup (<i>, <sub>, ...),
        # where elem.text alone would be None or truncated.
        return "".join(elem.itertext()).strip()

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    title_elem = root.find(".//ArticleTitle")
    title = (_element_text(title_elem) if title_elem is not None else "") or "No Title"

    # Structured abstracts are split into several AbstractText sections; use
    # all of them rather than only the first.
    sections = [_element_text(elem) for elem in root.findall(".//Abstract/AbstractText")]
    abstract = " ".join(part for part in sections if part) or "No Abstract"

    keywords = [_element_text(kw) for kw in root.findall(".//Keyword")]
    keyword_text = " ".join(kw for kw in keywords if kw) or "No Keywords"

    print(f"\nπ ARTICOLO RECUPERATO\nπ Titolo: {title}\nπ Abstract: {abstract[:500]}...\nπ Keywords: {keyword_text}\n")
    return title, f"{abstract} {keyword_text}"
|
85 |
+
|
86 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed and return the matching article ids.

    Args:
        query: Search term.
        year_start: First publication year of the date filter.
        year_end: Last publication year of the date filter.
        max_results: Maximum number of ids requested (``retmax``).

    Returns:
        The ``idlist`` of the ESearch response, or an empty list when the
        request fails or the response is not valid JSON.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    date_filter = f"({year_start}[PDAT] : {year_end}[PDAT])"
    params = {
        "db": "pubmed",
        "term": f"{query} AND {date_filter}",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose decode error is not a RequestException.
        print(f"Errore recupero articoli PubMed: {e}")
        return []
    return payload.get("esearchresult", {}).get("idlist", [])
|
96 |
+
|
97 |
+
@app.route("/")
def index():
    """Render the analysis form (no results)."""
    return render_template("NORUS.html")
|
100 |
+
|
101 |
+
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the upload, run the comparison, render results.

    Form fields read: ``pdf_file`` (upload), ``analysis_type`` ("local" or
    "pubmed"), ``local_directory``, ``query``, ``year_start``, ``year_end``,
    ``num_articles``.

    Returns:
        The rendered ``NORUS.html`` template with ``results``, or a redirect to
        the index page after flashing an error.  An unrecognised
        ``analysis_type`` renders the page with an empty result list.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()

    # secure_filename() can reduce a name such as "../" to "", which would make
    # the save target the upload directory itself.
    filename = secure_filename(pdf_file.filename) if pdf_file and pdf_file.filename else ""
    if not filename:
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))

    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(review): local_dir is a server-side path supplied by the client;
        # on a public deployment consider restricting it to an allowed root.
        if not os.path.isdir(local_dir):
            flash("Seleziona una directory valida.", "error")
            return redirect(url_for("index"))
        # Sorted for a stable result order; extension match is case-insensitive
        # so "paper.PDF" is not silently ignored.
        comparison_files = [
            os.path.join(local_dir, name)
            for name in sorted(os.listdir(local_dir))
            if name.lower().endswith(".pdf")
        ]
        if not comparison_files:
            flash("La directory non contiene PDF.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        try:
            num_articles = max(1, int(request.form.get("num_articles", "10")))
        except ValueError:
            # Empty or non-numeric field: fall back to the form's default.
            num_articles = 10
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [text for _, text in pubmed_results],
            method="pubmed",
            titles=[title for title, _ in pubmed_results],
        )
    return render_template("NORUS.html", results=results)
|
131 |
+
|
132 |
+
if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution from the browser,
    # so it must not be on by default for a server bound to 0.0.0.0.
    # Set FLASK_DEBUG=1 to enable it for local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1", host="0.0.0.0", port=7860)
|
models/similarity_model.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
3 |
+
|
4 |
+
def compute_similarity(text1, text2):
    """Compute the cosine similarity between two texts using TF-IDF vectors.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float | None: The similarity value, or ``None`` when either text is
        empty/blank or the vectorisation fails (the reason is printed).
    """
    try:
        # Reject missing or blank input before vectorising.
        if not text1 or not text2 or not text1.strip() or not text2.strip():
            raise ValueError("Uno o entrambi i testi sono vuoti.")

        # TF-IDF vectorisation over the two documents.
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])

        # Cosine similarity between row 0 and row 1; the result is a 1x1 matrix.
        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

        # Convert the numpy scalar to a plain Python float.
        return float(similarity_matrix[0][0])

    except (ValueError, AttributeError, TypeError) as e:
        print(f"Errore durante il calcolo della similarità: {e}")
        return None
|
32 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
flask
|
2 |
+
pdfplumber
|
3 |
+
nltk
|
4 |
+
sentence-transformers
|
5 |
+
scikit-learn
|
6 |
+
pandas
|
7 |
+
reportlab
|
8 |
+
matplotlib
|
9 |
+
requests
|
10 |
+
keybert
|
11 |
+
torch
|
12 |
+
transformers
|
start_local.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
echo "βοΈ Avvio dell'ambiente NORUS..."
|
3 |
+
python3 -m venv venv
|
4 |
+
source venv/bin/activate
|
5 |
+
pip install --upgrade pip
|
6 |
+
pip install -r requirements.txt
|
7 |
+
echo "β
Ambiente pronto. Avvio del server Flask..."
|
8 |
+
python app.py
|
static/css/style.css
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Generale */
|
2 |
+
html, body {
|
3 |
+
height: 100%;
|
4 |
+
margin: 0;
|
5 |
+
padding: 0;
|
6 |
+
overflow-y: auto; /* Permette lo scrolling */
|
7 |
+
}
|
8 |
+
|
9 |
+
body {
|
10 |
+
font-family: Arial, sans-serif;
|
11 |
+
background-color: #f8f8f8;
|
12 |
+
color: #333;
|
13 |
+
display: flex;
|
14 |
+
flex-direction: column;
|
15 |
+
min-height: 100vh;
|
16 |
+
}
|
17 |
+
|
18 |
+
/* Header */
|
19 |
+
header {
|
20 |
+
background-color: rgba(42, 77, 111, 0.8); /* Aggiunge trasparenza */
|
21 |
+
color: #fff;
|
22 |
+
padding: 20px;
|
23 |
+
text-align: center;
|
24 |
+
position: relative;
|
25 |
+
}
|
26 |
+
|
27 |
+
header h1 {
|
28 |
+
margin-bottom: 10px;
|
29 |
+
font-size: 2em;
|
30 |
+
}
|
31 |
+
|
32 |
+
header p {
|
33 |
+
font-size: 1.2em;
|
34 |
+
margin-top: 0;
|
35 |
+
}
|
36 |
+
|
37 |
+
/* Logo */
|
38 |
+
#logo {
|
39 |
+
max-width: 200px;
|
40 |
+
cursor: pointer;
|
41 |
+
transition: transform 0.3s ease;
|
42 |
+
}
|
43 |
+
|
44 |
+
#logo:hover {
|
45 |
+
transform: scale(1.2);
|
46 |
+
}
|
47 |
+
|
48 |
+
/* Container principale */
|
49 |
+
.container {
|
50 |
+
flex: 1;
|
51 |
+
width: 100%;
|
52 |
+
display: flex;
|
53 |
+
flex-direction: column;
|
54 |
+
align-items: center;
|
55 |
+
padding: 20px;
|
56 |
+
}
|
57 |
+
|
58 |
+
/* Form */
|
59 |
+
form {
|
60 |
+
margin: 20px auto;
|
61 |
+
text-align: center;
|
62 |
+
width: 80%;
|
63 |
+
max-width: 800px;
|
64 |
+
padding: 25px;
|
65 |
+
background-color: #fff;
|
66 |
+
border-radius: 10px;
|
67 |
+
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
|
68 |
+
overflow-y: auto; /* Permette scrolling interno se necessario */
|
69 |
+
max-height: 90vh; /* Impedisce che il form esca dallo schermo */
|
70 |
+
}
|
71 |
+
|
72 |
+
/* Etichette e campi input */
|
73 |
+
label {
|
74 |
+
display: block;
|
75 |
+
font-size: 1.1em;
|
76 |
+
margin-bottom: 8px;
|
77 |
+
font-weight: bold;
|
78 |
+
}
|
79 |
+
|
80 |
+
input, select {
|
81 |
+
width: 100%;
|
82 |
+
padding: 10px;
|
83 |
+
margin-bottom: 15px;
|
84 |
+
border: 1px solid #ccc;
|
85 |
+
border-radius: 5px;
|
86 |
+
font-size: 1.1em;
|
87 |
+
}
|
88 |
+
|
89 |
+
/* Pulsanti */
|
90 |
+
button {
|
91 |
+
width: 100%;
|
92 |
+
background-color: #2a4d6f;
|
93 |
+
color: #fff;
|
94 |
+
padding: 12px 20px;
|
95 |
+
border: none;
|
96 |
+
border-radius: 5px;
|
97 |
+
cursor: pointer;
|
98 |
+
font-size: 1.2em;
|
99 |
+
transition: background-color 0.3s;
|
100 |
+
margin-top: 10px;
|
101 |
+
}
|
102 |
+
|
103 |
+
button:hover {
|
104 |
+
background-color: #1a3d56;
|
105 |
+
}
|
106 |
+
|
107 |
+
/* Sezione Risultati */
|
108 |
+
.results {
|
109 |
+
padding: 25px;
|
110 |
+
background-color: #fff;
|
111 |
+
margin: 30px auto;
|
112 |
+
border-radius: 10px;
|
113 |
+
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
|
114 |
+
max-width: 1000px;
|
115 |
+
overflow-y: auto;
|
116 |
+
max-height: 80vh;
|
117 |
+
}
|
118 |
+
|
119 |
+
/* Tabelle */
|
120 |
+
table {
|
121 |
+
width: 100%;
|
122 |
+
border-collapse: collapse;
|
123 |
+
margin-top: 25px;
|
124 |
+
}
|
125 |
+
|
126 |
+
th {
|
127 |
+
background-color: #2a4d6f;
|
128 |
+
color: #fff;
|
129 |
+
padding: 12px;
|
130 |
+
text-align: left;
|
131 |
+
}
|
132 |
+
|
133 |
+
td {
|
134 |
+
padding: 12px;
|
135 |
+
border-bottom: 1px solid #ddd;
|
136 |
+
background-color: #f8f9fa;
|
137 |
+
}
|
138 |
+
|
139 |
+
table tr:hover {
|
140 |
+
background-color: #f1f1f1;
|
141 |
+
}
|
142 |
+
|
143 |
+
table th, table td {
|
144 |
+
font-size: 1.1em;
|
145 |
+
}
|
146 |
+
|
147 |
+
/* Contenitore del grafico */
|
148 |
+
#chart-container {
|
149 |
+
width: 90%;
|
150 |
+
max-width: 1000px;
|
151 |
+
height: 600px;
|
152 |
+
margin: 40px auto;
|
153 |
+
padding-bottom: 20px;
|
154 |
+
}
|
155 |
+
|
156 |
+
canvas {
|
157 |
+
width: 100% !important;
|
158 |
+
height: 100% !important;
|
159 |
+
display: block;
|
160 |
+
}
|
161 |
+
|
162 |
+
#logo {
|
163 |
+
display: block; /* Assicura che il logo sia trattato come un blocco */
|
164 |
+
margin: 0 auto; /* Lo centra orizzontalmente */
|
165 |
+
max-width: 200px;
|
166 |
+
height: auto;
|
167 |
+
cursor: pointer;
|
168 |
+
transition: transform 0.3s ease;
|
169 |
+
position: relative; /* Evita sovrapposizioni strane */
|
170 |
+
z-index: 10; /* Assicura che sia sopra altri elementi */
|
171 |
+
}
|
172 |
+
|
173 |
+
/* Barra di caricamento */
|
174 |
+
#progress-container {
|
175 |
+
width: 100%;
|
176 |
+
background-color: #f3f3f3;
|
177 |
+
border-radius: 25px;
|
178 |
+
overflow: hidden;
|
179 |
+
margin: 20px 0;
|
180 |
+
}
|
181 |
+
|
182 |
+
#progress-bar {
|
183 |
+
height: 20px;
|
184 |
+
width: 0;
|
185 |
+
background-color: #4caf50;
|
186 |
+
text-align: center;
|
187 |
+
line-height: 20px;
|
188 |
+
color: white;
|
189 |
+
}
|
190 |
+
|
191 |
+
/* Footer */
|
192 |
+
footer {
|
193 |
+
background-color: #2a4d6f;
|
194 |
+
color: #fff;
|
195 |
+
text-align: center;
|
196 |
+
padding: 15px;
|
197 |
+
width: 100%;
|
198 |
+
font-size: 1.1em;
|
199 |
+
margin-top: auto; /* Il footer si posiziona in fondo alla pagina */
|
200 |
+
}
|
static/js/script.js
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Progress bar shown while the analysis request is in flight.
// Declared at top level (not inside the DOMContentLoaded handler) because the
// template calls it from an inline onsubmit="startProgress()" attribute, and
// inline handlers can only resolve global names.
let progressTimer = null;

function startProgress() {
    const progressBar = document.getElementById("progress-bar");
    const progressContainer = document.getElementById("progress-container");
    // Ignore a second call while the bar is already animating.
    if (!progressBar || !progressContainer || progressTimer !== null) {
        return;
    }
    progressContainer.style.display = "block";
    let width = 0;
    progressTimer = setInterval(() => {
        if (width >= 100) {
            clearInterval(progressTimer);
            progressTimer = null;
        } else {
            width++;
            progressBar.style.width = width + "%";
            progressBar.textContent = width + "%";
        }
    }, 100);
}

document.addEventListener("DOMContentLoaded", function () {
    // Logo click: short zoom animation.
    const logoLink = document.getElementById("logo-link");
    const logo = document.getElementById("logo");
    if (logoLink && logo) {
        logoLink.addEventListener("click", function () {
            logo.style.transform = "scale(1.5)";
            setTimeout(() => {
                logo.style.transform = "scale(1)";
            }, 500);
        });
    }

    // Start the progress bar when a form with id "analysisForm" is submitted.
    const analysisForm = document.getElementById("analysisForm");
    if (analysisForm) {
        analysisForm.addEventListener("submit", startProgress);
    }

    // The results chart is NOT created here. A static file is served as-is and
    // never rendered by Jinja, so the former `{{ results | ... }}` expressions
    // were a JavaScript syntax error that prevented this whole file from
    // loading. The chart is built by the inline script in templates/NORUS.html,
    // where the template data is available.

    // Show the selected file name in the upload label.
    const fileInput = document.getElementById("pdf_file");
    if (fileInput) {
        fileInput.addEventListener("change", function () {
            const fileLabel = document.querySelector('label[for="pdf_file"]');
            if (fileInput.files.length > 0 && fileLabel) {
                fileLabel.textContent = `File selected: ${fileInput.files[0].name}`;
            }
        });
    }

    // Remove error messages automatically after five seconds.
    const flashMessages = document.querySelectorAll(".error");
    if (flashMessages.length > 0) {
        setTimeout(() => {
            flashMessages.forEach(message => message.remove());
        }, 5000);
    }

    // Show only the option group that matches the chosen analysis type.
    const analysisType = document.getElementById("analysis_type");
    const pubmedOptions = document.getElementById("pubmed-options");
    const localOptions = document.getElementById("local-options");
    if (analysisType && pubmedOptions && localOptions) {
        analysisType.addEventListener("change", function () {
            pubmedOptions.style.display = this.value === "pubmed" ? "block" : "none";
            localOptions.style.display = this.value === "local" ? "block" : "none";
        });
    }
});
|
templates/NORUS.html
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>NORUS Tool</title>
|
7 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
|
8 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
9 |
+
<script src="{{ url_for('static', filename='js/script.js') }}"></script>
|
10 |
+
</head>
|
11 |
+
<body>
|
12 |
+
<header>
|
13 |
+
<div style="text-align: center; margin-top: 20px;">
|
14 |
+
<a href="#" id="logo-link">
|
15 |
+
<img src="{{ url_for('static', filename='images/NORUS.PNG') }}" alt="NORUS Logo" id="logo" style="max-width: 200px; height: auto;">
|
16 |
+
</a>
|
17 |
+
</div>
|
18 |
+
<h1>NORUS Tool</h1>
|
19 |
+
<p>Analyze your PDF and discover originality and similarity</p>
|
20 |
+
</header>
|
21 |
+
<main>
|
22 |
+
<form action="/validate" method="POST" enctype="multipart/form-data" onsubmit="startProgress()">
|
23 |
+
<label for="analysis_type">Choose Analysis Type:</label>
|
24 |
+
<select name="analysis_type" id="analysis_type" required>
|
25 |
+
<option value="local">Local Directory</option>
|
26 |
+
<option value="pubmed">PubMed Search</option>
|
27 |
+
</select>
|
28 |
+
|
29 |
+
<div id="pubmed-options" style="display: none;">
|
30 |
+
<label for="query">PubMed Query:</label>
|
31 |
+
<input type="text" name="query" id="query">
|
32 |
+
|
33 |
+
<label for="year_start">Start Year:</label>
|
34 |
+
<input type="number" name="year_start" id="year_start" min="1900" max="2025" value="2000">
|
35 |
+
|
36 |
+
<label for="year_end">End Year:</label>
|
37 |
+
<input type="number" name="year_end" id="year_end" min="1900" max="2025" value="2025">
|
38 |
+
|
39 |
+
<label for="num_articles">Number of Articles:</label>
|
40 |
+
<input type="number" name="num_articles" id="num_articles" min="1" value="10">
|
41 |
+
</div>
|
42 |
+
|
43 |
+
<div id="local-options" style="display: none;">
|
44 |
+
<label for="local_directory">Select Local Directory:</label>
|
45 |
+
<input type="text" name="local_directory" id="local_directory" placeholder="Enter directory path">
|
46 |
+
</div>
|
47 |
+
|
48 |
+
<label for="pdf_file">Upload PDF:</label>
|
49 |
+
<input type="file" name="pdf_file" id="pdf_file" required>
|
50 |
+
|
51 |
+
<button type="submit">Analyze</button>
|
52 |
+
</form>
|
53 |
+
|
54 |
+
<div id="progress-container" style="display: none;">
|
55 |
+
<div id="progress-bar">0%</div>
|
56 |
+
</div>
|
57 |
+
|
58 |
+
{% if results %}
|
59 |
+
<section>
|
60 |
+
<h2>Analysis Results</h2>
|
61 |
+
<table>
|
62 |
+
<thead>
|
63 |
+
<tr>
|
64 |
+
<th>Title</th>
|
65 |
+
<th>Semantic Similarity (%)</th>
|
66 |
+
<th>Token Overlap (%)</th>
|
67 |
+
<th>OUI (Originality & Uniqueness Index)</th>
|
68 |
+
</tr>
|
69 |
+
</thead>
|
70 |
+
<tbody>
|
71 |
+
{% for result in results %}
|
72 |
+
<tr>
|
73 |
+
<td style="max-width: 400px; word-wrap: break-word;">{{ result.title }}</td>
|
74 |
+
<td>{{ "%.2f"|format(result.similarity) }}</td>
|
75 |
+
<td>{{ "%.2f"|format(result.token_overlap) }}</td>
|
76 |
+
<td>{{ "%.2f"|format(result.oui) }}</td>
|
77 |
+
</tr>
|
78 |
+
{% endfor %}
|
79 |
+
</tbody>
|
80 |
+
</table>
|
81 |
+
|
82 |
+
<div id="chart-container" style="margin-top: 50px;">
|
83 |
+
<canvas id="similarityChart"></canvas>
|
84 |
+
</div>
|
85 |
+
|
86 |
+
<script>
|
87 |
+
// tojson emits valid, script-safe JSON; rendering the Python list repr with
// |safe breaks on some titles and lets a title inject markup into the page.
const labels = {{ results | map(attribute='title') | list | tojson }};
const semanticData = {{ results | map(attribute='similarity') | list | tojson }};
const tokenData = {{ results | map(attribute='token_overlap') | list | tojson }};
// The backend already returns OUI as a percentage (0-100); do not rescale it.
const ouiData = {{ results | map(attribute='oui') | list | tojson }};
|
91 |
+
|
92 |
+
new Chart(document.getElementById('similarityChart'), {
|
93 |
+
type: 'bar',
|
94 |
+
data: {
|
95 |
+
labels: labels,
|
96 |
+
datasets: [
|
97 |
+
{
|
98 |
+
label: 'Semantic Similarity (%)',
|
99 |
+
data: semanticData,
|
100 |
+
backgroundColor: 'rgba(54, 162, 235, 0.7)',
|
101 |
+
borderColor: 'rgba(54, 162, 235, 1)',
|
102 |
+
borderWidth: 1
|
103 |
+
},
|
104 |
+
{
|
105 |
+
label: 'Token Overlap (%)',
|
106 |
+
data: tokenData,
|
107 |
+
backgroundColor: 'rgba(255, 159, 64, 0.7)',
|
108 |
+
borderColor: 'rgba(255, 159, 64, 1)',
|
109 |
+
borderWidth: 1
|
110 |
+
},
|
111 |
+
{
|
112 |
+
label: 'OUI (%)',
|
113 |
+
data: ouiData,
|
114 |
+
backgroundColor: 'rgba(153, 102, 255, 0.7)',
|
115 |
+
borderColor: 'rgba(153, 102, 255, 1)',
|
116 |
+
borderWidth: 1
|
117 |
+
}
|
118 |
+
]
|
119 |
+
},
|
120 |
+
options: {
|
121 |
+
responsive: true,
|
122 |
+
maintainAspectRatio: false,
|
123 |
+
plugins: {
|
124 |
+
legend: {
|
125 |
+
position: 'top',
|
126 |
+
},
|
127 |
+
tooltip: {
|
128 |
+
mode: 'index',
|
129 |
+
intersect: false
|
130 |
+
}
|
131 |
+
},
|
132 |
+
scales: {
|
133 |
+
y: { beginAtZero: true },
|
134 |
+
x: {
|
135 |
+
ticks: {
|
136 |
+
autoSkip: false,
|
137 |
+
maxRotation: 45,
|
138 |
+
minRotation: 45
|
139 |
+
}
|
140 |
+
}
|
141 |
+
}
|
142 |
+
}
|
143 |
+
});
|
144 |
+
</script>
|
145 |
+
</section>
|
146 |
+
{% endif %}
|
147 |
+
</main>
|
148 |
+
<footer>
|
149 |
+
<p>© 2025 NORUS Tool. All rights reserved.</p>
|
150 |
+
</footer>
|
151 |
+
|
152 |
+
<script>
|
153 |
+
document.addEventListener("DOMContentLoaded", function() {
|
154 |
+
const analysisType = document.getElementById("analysis_type");
|
155 |
+
const pubmedOptions = document.getElementById("pubmed-options");
|
156 |
+
const localOptions = document.getElementById("local-options");
|
157 |
+
|
158 |
+
function toggleOptions() {
|
159 |
+
if (analysisType.value === "pubmed") {
|
160 |
+
pubmedOptions.style.display = "block";
|
161 |
+
localOptions.style.display = "none";
|
162 |
+
} else {
|
163 |
+
pubmedOptions.style.display = "none";
|
164 |
+
localOptions.style.display = "block";
|
165 |
+
}
|
166 |
+
}
|
167 |
+
|
168 |
+
analysisType.addEventListener("change", toggleOptions);
|
169 |
+
toggleOptions();
|
170 |
+
});
|
171 |
+
</script>
|
172 |
+
</body>
|
173 |
+
</html>
|
templates/app.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import pdfplumber
|
4 |
+
import numpy as np
|
5 |
+
from flask import Flask, render_template, request, redirect, url_for, flash
|
6 |
+
from werkzeug.utils import secure_filename
|
7 |
+
from sentence_transformers import SentenceTransformer, util
|
8 |
+
import nltk
|
9 |
+
from nltk.stem import WordNetLemmatizer, PorterStemmer
|
10 |
+
from nltk.tokenize import word_tokenize
|
11 |
+
from nltk.corpus import stopwords
|
12 |
+
|
13 |
+
# Fetch the NLTK resources needed by word_tokenize, WordNetLemmatizer and the
# English stop-word list used below.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores its messages in the session, and Flask refuses to open a
# session without a secret key (RuntimeError).  Prefer a key supplied through
# the SECRET_KEY environment variable; otherwise fall back to a random
# per-process key, which is enough for short-lived flash messages.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Sentence-embedding model, loaded once at import time and shared by requests.
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
|
26 |
+
|
27 |
+
def extract_pdf_text(pdf_path):
    """Return the lower-cased, stripped text of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, lower-cased and stripped.
        If extraction fails part-way, the text collected so far is returned
        (an empty string when nothing could be read); the error is printed
        rather than raised.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # A page with no extractable text yields a falsy value.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Best effort by design: callers treat an unreadable PDF as empty text.
        print(f"Errore estrazione testo: {e}")
    # Join with a separator so the last word of one page does not fuse with
    # the first word of the next one.
    return "\n".join(page_texts).lower().strip()
|
36 |
+
|
37 |
+
def preprocess_text(text):
    """Normalize text into a space-separated string of stemmed lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stop words are dropped, and each
    remaining token is lemmatized and then stemmed.
    """
    normalized = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        normalized.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return " ".join(normalized)
|
42 |
+
|
43 |
+
def calculate_token_overlap(text1, text2):
    """Return the percentage of distinct tokens of ``text1`` found in ``text2``.

    Tokens are whitespace-separated words compared case-insensitively, so a
    lower-cased PDF text and an abstract in its original casing are compared
    on equal terms.  The measure is asymmetric: it is relative to the
    vocabulary of ``text1`` only.

    Args:
        text1: Reference text whose vocabulary is the denominator.
        text2: Text to look the tokens up in.

    Returns:
        A float in [0, 100] rounded to two decimals; 0.0 when ``text1``
        contains no tokens.
    """
    tokens1 = set(text1.lower().split())
    tokens2 = set(text2.lower().split())
    if not tokens1:
        return 0.0
    return round(len(tokens1 & tokens2) / len(tokens1) * 100, 2)
|
48 |
+
|
49 |
+
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Combine similarity and token overlap into a 0-100 originality score.

    Both inputs are percentages.  Each is turned into its complement
    (``1 - value / 100``), the complements are mixed with the weights
    ``alpha`` and ``beta``, and the result is scaled to a percentage,
    clamped to [0, 100] and rounded to two decimals.
    """
    semantic_novelty = 1 - similarity / 100
    lexical_novelty = 1 - token_overlap / 100
    score = (alpha * semantic_novelty + beta * lexical_novelty) * 100
    clamped = max(0, min(score, 100))
    return round(clamped, 2)
|
52 |
+
|
53 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare a PDF against a list of reference documents.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: With ``method="local"``, paths of PDFs to compare
            against; otherwise the reference texts themselves.
        method: ``"local"`` to read each source with ``extract_pdf_text``;
            any other value uses the source as text directly.
        titles: Optional titles, matched to ``comparison_sources`` by index.

    Returns:
        One dict per source with the keys ``title``, ``similarity``,
        ``token_overlap`` and ``oui`` (all numeric values rounded to two
        decimals), in the order of ``comparison_sources``.
    """
    sources = list(comparison_sources)
    if not sources:
        return []

    pdf_text = extract_pdf_text(pdf_path)
    # The uploaded document is the same for every comparison, so embed it once
    # instead of re-encoding it on each loop iteration.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)

    results = []
    for i, doc in enumerate(sources):
        doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        # Prefer an explicit title; fall back to the file name for local PDFs.
        if titles and i < len(titles):
            title = titles[i]
        elif method == "local":
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results
|
67 |
+
|
68 |
+
def fetch_pubmed_details(article_id):
    """Fetch title, abstract and keywords of one PubMed article via efetch.

    Args:
        article_id: PubMed identifier passed as the ``id`` query parameter.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by
        the keywords.  Missing parts are replaced by the placeholders
        ``"No Title"``, ``"No Abstract"`` and ``"No Keywords"``.  On a network
        error or an unparsable response, ``("No Title", "No Abstract")`` is
        returned and the error is printed.
    """
    import xml.etree.ElementTree as ET

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    def node_text(node):
        # itertext() also collects text nested in inline markup (<i>, <sub>, ...),
        # for which the plain .text attribute is None or truncated.
        return "".join(node.itertext()).strip()

    title_node = root.find(".//ArticleTitle")
    title = (node_text(title_node) if title_node is not None else "") or "No Title"

    # Structured abstracts are split into several AbstractText sections.
    sections = [node_text(node) for node in root.findall(".//AbstractText")]
    abstract = " ".join(section for section in sections if section) or "No Abstract"

    keywords = root.findall(".//Keyword")
    keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else "No Keywords"

    print(f"\nπ ARTICOLO RECUPERATO\nπ Titolo: {title}\nπ Abstract: {abstract[:500]}...\nπ Keywords: {keyword_text}\n")
    return title, f"{abstract} {keyword_text}"
|
85 |
+
|
86 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed and return the matching article IDs.

    Args:
        query: Search term.
        year_start: First publication year of the ``[PDAT]`` range filter.
        year_end: Last publication year of the ``[PDAT]`` range filter.
        max_results: Maximum number of IDs requested (``retmax``).

    Returns:
        The ``idlist`` of the esearch response, or an empty list when the
        request fails or the response cannot be decoded (the error is printed).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Errore recupero articoli PubMed: {e}")
    return []
|
97 |
+
|
98 |
+
@app.route("/")
def index():
    """Render the NORUS.html template without any analysis results."""
    return render_template("NORUS.html")
|
101 |
+
|
102 |
+
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the uploaded PDF and compare it.

    Reads ``pdf_file`` from the uploaded files and ``analysis_type``,
    ``local_directory``, ``query``, ``year_start``, ``year_end`` and
    ``num_articles`` from the form.  With ``analysis_type == "local"`` the
    upload is compared against the PDFs found in ``local_directory``; with
    ``"pubmed"`` it is compared against the articles returned by the PubMed
    search.  Any other analysis type renders the page with no results.

    Returns:
        The rendered ``NORUS.html`` with ``results``, or a redirect to the
        index page (with a flashed error) when the input is invalid.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()

    raw_name = pdf_file.filename if pdf_file else ""
    if not raw_name or not raw_name.lower().endswith(".pdf"):
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))

    filename = secure_filename(raw_name)
    if not filename.lower().endswith(".pdf"):
        # secure_filename() can strip a name down to nothing usable (e.g. a
        # fully non-ASCII stem); fall back to a fixed, safe name.
        filename = "upload.pdf"
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(security): local_directory is supplied by the client and is
        # listed and read on the server as-is.  If this app is reachable by
        # untrusted users, restrict it to an allow-listed base directory.
        if not os.path.isdir(local_dir):
            flash("Seleziona una directory valida.", "error")
            return redirect(url_for("index"))
        comparison_files = [
            os.path.join(local_dir, f)
            for f in os.listdir(local_dir)
            if f.lower().endswith(".pdf")
        ]
        if not comparison_files:
            flash("La directory non contiene PDF.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        # A malformed or non-positive count falls back to the default instead
        # of turning into a 500 error.
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            num_articles = 10
        if num_articles < 1:
            num_articles = 10
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [text for _, text in pubmed_results],
            method="pubmed",
            titles=[title for title, _ in pubmed_results],
        )
    return render_template("NORUS.html", results=results)
|
132 |
+
|
133 |
+
if __name__ == "__main__":
    # The interactive debugger allows arbitrary code execution, so it is only
    # enabled on explicit request (FLASK_DEBUG=1 / true) and never by default.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    # Bind to all interfaces: the default 127.0.0.1 is not reachable from
    # outside the container this app is started in via `python app.py`.
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|