Uploaded NORUS app files
Browse files- .DS_Store +0 -0
- Dockerfile +27 -0
- README.md +81 -8
- app.py +216 -0
- app_setup.sh +6 -0
- index.html +0 -19
- models/.DS_Store +0 -0
- models/__pycache__/similarity_model.cpython-313.pyc +0 -0
- models/similarity_model.py +32 -0
- requirements.txt +14 -0
- start_local.sh +8 -0
- static/.DS_Store +0 -0
- static/css/.DS_Store +0 -0
- static/css/style.css +221 -0
- static/js/.DS_Store +0 -0
- static/js/script.js +102 -0
- style.css +0 -28
- templates/.DS_Store +0 -0
- templates/NORUS.html +180 -0
- templates/app.py +134 -0
.DS_Store
ADDED
Binary file (10.2 kB). View file
|
|
Dockerfile
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9
|
2 |
+
|
3 |
+
# 1. Crea utente non-root (richiesto da Hugging Face)
|
4 |
+
RUN useradd -m -u 1000 user
|
5 |
+
USER user
|
6 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
7 |
+
|
8 |
+
# 2. Crea directory di lavoro
|
9 |
+
WORKDIR /app
|
10 |
+
|
11 |
+
# 3. Copia requirements e installa pacchetti
|
12 |
+
COPY --chown=user requirements.txt .
|
13 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
14 |
+
|
15 |
+
# 4. Copia script di setup per NLTK
|
16 |
+
COPY --chown=user app_setup.sh .
|
17 |
+
RUN chmod +x app_setup.sh && ./app_setup.sh
|
18 |
+
|
19 |
+
# 5. Copia tutto il resto dell'app
|
20 |
+
COPY --chown=user . .
|
21 |
+
|
22 |
+
# 6. Imposta variabile per NLTK
|
23 |
+
ENV NLTK_DATA="/home/user/nltk_data"
|
24 |
+
|
25 |
+
# 7. Avvia l'app
|
26 |
+
CMD ["python", "app.py"]
|
27 |
+
|
README.md
CHANGED
@@ -1,11 +1,84 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
-
|
8 |
-
|
9 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Norus Tool
|
3 |
+
emoji: ๐ฅ
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: purple
|
6 |
+
sdk: docker
|
7 |
+
app_file: app.py
|
8 |
+
pinned: true
|
9 |
---
|
10 |
+
# NORUS Tool ๐ง ๐
|
11 |
+
|
12 |
+
[NORUS Tool on Hugging Face Spaces](https://huggingface.co/spaces/mabil/norus-tool)
|
13 |
+
|
14 |
+
**NORUS** (Novelty and Originality Recognition Utility System) è uno strumento basato su intelligenza artificiale che consente l'analisi semantica di articoli scientifici in formato PDF, confrontandoli con articoli locali o pubblicati su PubMed. Il tool restituisce misure di **similarità semantica**, **token overlap** e un indice composito chiamato **OUI (Originality & Uniqueness Index)**.
|
15 |
+
|
16 |
+
## Funzionalità principali
|
17 |
+
|
18 |
+
- โ
Caricamento PDF da analizzare
|
19 |
+
- ๐ Confronto con PDF locali o articoli da PubMed
|
20 |
+
- ๐ค Estrazione di embedding semantici tramite SciBERT
|
21 |
+
- ๐ Calcolo di:
|
22 |
+
- Similarità semantica (cosine similarity)
|
23 |
+
- Sovrapposizione testuale (token overlap)
|
24 |
+
- Indice OUI (originalità e novità)
|
25 |
+
- ๐ Visualizzazione interattiva dei risultati via Chart.js
|
26 |
+
|
27 |
+
## ๐งช OUI - Originality & Uniqueness Index
|
28 |
+
|
29 |
+
```math
|
30 |
+
OUI = 1 - (α × semantic_similarity + β × token_overlap)
|
31 |
+
```
|
32 |
+
|
33 |
+
- α = 0.7 → penalizza la somiglianza semantica
|
34 |
+
- β = 0.3 → penalizza la ripetizione letterale
|
35 |
+
- L'OUI misura **quanto un documento è originale**, sia nel contenuto che nella forma.
|
36 |
+
|
37 |
+
## ๐งฑ Architettura
|
38 |
+
|
39 |
+
- `Flask` come backend web
|
40 |
+
- `pdfplumber` per l'estrazione del testo dai PDF
|
41 |
+
- `nltk` per preprocessing linguistico
|
42 |
+
- `sentence-transformers` con modello `allenai/scibert_scivocab_uncased`
|
43 |
+
- `requests` per l'interfaccia con PubMed
|
44 |
+
|
45 |
+
## ๐ Struttura del progetto
|
46 |
+
|
47 |
+
```
|
48 |
+
.
|
49 |
+
โโโ app.py
|
50 |
+
โโโ Dockerfile
|
51 |
+
โโโ requirements.txt
|
52 |
+
โโโ static/
|
53 |
+
โโโ templates/
|
54 |
+
โโโ uploads/
|
55 |
+
โโโ README.md
|
56 |
+
```
|
57 |
+
|
58 |
+
## โถ๏ธ Esecuzione locale
|
59 |
+
|
60 |
+
Per eseguire localmente:
|
61 |
+
|
62 |
+
1. Assicurati di avere Python 3.9+
|
63 |
+
2. Installa le dipendenze:
|
64 |
+
|
65 |
+
```bash
|
66 |
+
pip install -r requirements.txt
|
67 |
+
```
|
68 |
+
|
69 |
+
3. Avvia l'app:
|
70 |
+
|
71 |
+
```bash
|
72 |
+
python app.py
|
73 |
+
```
|
74 |
+
|
75 |
+
Apri il browser su `http://localhost:7860`
|
76 |
+
|
77 |
+
## ๐ก Deploy su Hugging Face Spaces
|
78 |
+
|
79 |
+
Puoi caricare questo progetto come Space Docker-based su Hugging Face. Il `Dockerfile` è già configurato.
|
80 |
+
|
81 |
+
---
|
82 |
+
---
|
83 |
+
๐ง Developed by Marina Bilotta โ Computational Chemistry & AI Research
|
84 |
|
|
app.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import pdfplumber
|
4 |
+
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
|
5 |
+
from werkzeug.utils import secure_filename
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
+
from transformers import AutoTokenizer
|
8 |
+
from fpdf import FPDF
|
9 |
+
from collections import Counter
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
# WordPiece tokenizer used by preprocess_text() for keyword extraction and
# token-overlap computation (not for the semantic embeddings).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

app = Flask(__name__)
# Never fall back to a key that is committed to the repository: anyone who can
# read the source could forge session cookies. When SECRET_KEY is not set, use
# a random per-process key instead (sessions/flash messages then simply do not
# survive a restart).
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(32)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Sentence-embedding model used for the semantic similarity score.
model = SentenceTransformer("allenai/scibert_scivocab_uncased")

# Results of the most recent analysis, kept at module level so that
# /download_report can export them. NOTE(review): this state is shared by all
# clients of the process.
last_results = []
last_common_keywords = []
|
23 |
+
|
24 |
+
def extract_pdf_text(pdf_path):
    """Return the lower-cased, stripped text of all pages of a PDF.

    Page texts are joined with a newline so that the last word of one page is
    not fused with the first word of the next one.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. Extraction is best-effort: if opening or parsing
        fails, the error is printed and whatever was extracted so far (possibly
        an empty string) is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() may yield None (handled here by `or ""`).
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Errore estrazione testo: {e}")
    return "\n".join(page_texts).lower().strip()
|
33 |
+
|
34 |
+
def preprocess_text(text):
    """Tokenize *text* and keep only the informative word pieces.

    The text is lower-cased and split with the module-level tokenizer; a token
    is kept only when it is purely alphabetic and longer than three characters.

    Returns:
        The list of retained tokens, in their original order.
    """
    kept = []
    for piece in tokenizer.tokenize(text.lower()):
        if piece.isalpha() and len(piece) > 3:
            kept.append(piece)
    return kept
|
38 |
+
|
39 |
+
def calculate_token_overlap(text1, text2):
    """Return the percentage of distinct tokens of *text1* also found in *text2*.

    Both texts are split on whitespace and compared as sets. The measure is
    asymmetric: it is normalised by the number of distinct tokens in *text1*
    (or by 1 when *text1* has none, which yields 0.0).

    Returns:
        A float percentage rounded to two decimals.
    """
    first_vocab = set(text1.split())
    shared = first_vocab.intersection(text2.split())
    denominator = max(len(first_vocab), 1)
    return round(len(shared) / denominator * 100, 2)
|
44 |
+
|
45 |
+
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality & Uniqueness Index (OUI) as a percentage.

    OUI = alpha * (1 - similarity/100) + beta * (1 - token_overlap/100),
    scaled to a percentage and rounded to two decimals.

    Args:
        similarity: Semantic similarity, expressed as a percentage.
        token_overlap: Token overlap, expressed as a percentage.
        alpha: Weight of the semantic term.
        beta: Weight of the lexical term.

    Returns:
        The OUI percentage; a zero result is always returned as positive 0.0.
    """
    semantic_novelty = 1 - similarity / 100
    lexical_novelty = 1 - token_overlap / 100
    score = round((alpha * semantic_novelty + beta * lexical_novelty) * 100, 2)
    if score == -0.0:
        # Normalise negative zero so it is never shown as "-0.0".
        return 0.0
    return score
|
49 |
+
|
50 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare a PDF against a set of reference documents.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: When ``method == "local"``, paths of PDFs to
            compare against; otherwise the reference texts themselves.
        method: ``"local"`` to read each source as a PDF file, any other value
            to treat each source as plain text.
        titles: Optional titles, matched to ``comparison_sources`` by index.

    Returns:
        A list with one dict per source, holding ``title``, ``similarity``,
        ``token_overlap`` and ``oui`` (all rounded to two decimals).

    Side effects:
        Overwrites the module-level ``last_results`` and
        ``last_common_keywords`` with the outcome of this call.
    """
    global last_results, last_common_keywords

    pdf_text = extract_pdf_text(pdf_path)
    pdf_tokens = preprocess_text(pdf_text)
    pdf_vocab = set(pdf_tokens)
    # The embedding of the analysed PDF does not depend on the reference
    # document, so compute it once instead of once per comparison.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    is_local = method == "local"

    results = []
    all_keywords = []

    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if is_local else doc
        doc_tokens = preprocess_text(doc_text)

        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100

        token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
        oui = calculate_oui(similarity, token_overlap)

        if titles and i < len(titles):
            title = titles[i]
        elif is_local:
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        # Sort before truncating: set iteration order is arbitrary, so taking
        # the "first five" of an unsorted set gives different keywords from
        # one run to the next.
        common_keywords = sorted(pdf_vocab & set(doc_tokens))[:5]
        all_keywords.extend(common_keywords)

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2)
        })

    last_results = results
    last_common_keywords = Counter(all_keywords).most_common(10)
    return results
|
83 |
+
|
84 |
+
def fetch_pubmed_details(article_id):
    """Fetch the title and abstract (plus keywords) of one PubMed article.

    Args:
        article_id: PubMed identifier passed to the E-utilities ``efetch`` call.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by a
        space and the space-joined keywords. Missing or empty fields are
        replaced by ``"No Title"`` / ``"No Abstract"``. On any error the
        failure is printed and ``("No Title", "No Abstract")`` is returned.
    """
    import xml.etree.ElementTree as ET

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request
        # handler indefinitely.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.text)

        # Use itertext() rather than .text: .text only returns the text before
        # the first child element and is None when the field starts with
        # inline markup.
        title_node = root.find(".//ArticleTitle")
        title = "".join(title_node.itertext()).strip() if title_node is not None else ""

        # An abstract may be split into several AbstractText sections; keep
        # all of them.
        abstract_parts = [
            "".join(node.itertext()).strip()
            for node in root.findall(".//AbstractText")
        ]
        abstract = " ".join(part for part in abstract_parts if part)

        keyword_text = " ".join(kw.text for kw in root.findall(".//Keyword") if kw.text)
        return title or "No Title", f"{abstract or 'No Abstract'} {keyword_text}"
    except Exception as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"
|
100 |
+
|
101 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed and return the identifiers of the matching articles.

    Args:
        query: Search term.
        year_start: First publication year of the date filter.
        year_end: Last publication year of the date filter.
        max_results: Maximum number of identifiers requested (``retmax``).

    Returns:
        The list of PubMed IDs from the ``esearch`` response, or an empty list
        when the request fails (the error is printed).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
        "sort": "relevance",  # order the hits by relevance
    }
    try:
        # Without a timeout a stalled connection would block the request
        # handler indefinitely.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except Exception as e:
        print(f"Errore fetch PubMed: {e}")
        return []
|
118 |
+
|
119 |
+
@app.route("/")
def index():
    """Render the NORUS.html template without any analysis results."""
    landing_page = render_template("NORUS.html")
    return landing_page
|
122 |
+
|
123 |
+
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the uploads and run the comparison.

    Reads ``pdf_file``, ``analysis_type`` and ``query`` from the request. With
    ``analysis_type == "local"`` the uploaded ``comparison_files`` are used as
    references; otherwise PubMed is searched using ``query``, ``year_start``,
    ``year_end`` and ``num_articles``.

    Returns:
        The rendered NORUS.html page with results and common keywords, or a
        redirect to the index page (with a flashed error) when the input is
        unusable or nothing can be compared.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    query = request.form.get("query", "").strip()

    # secure_filename() can reduce a name to "" (e.g. a name made only of
    # unsafe characters); saving would then target the upload directory itself.
    filename = secure_filename(pdf_file.filename) if pdf_file else ""
    if not filename:
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))

    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    if analysis_type == "local":
        saved_paths = []
        for file in request.files.getlist("comparison_files"):
            # Accept ".PDF" as well as ".pdf".
            if not file or not file.filename.lower().endswith(".pdf"):
                continue
            fname = secure_filename(file.filename)
            if not fname:
                continue
            path = os.path.join(app.config["UPLOAD_FOLDER"], fname)
            file.save(path)
            saved_paths.append(path)
        if not saved_paths:
            flash("Nessun file di confronto caricato.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, saved_paths, method="local")
    else:
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        # The value comes straight from the client; a non-numeric value must
        # not turn into an unhandled ValueError (HTTP 500).
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            flash("Numero di articoli non valido.", "error")
            return redirect(url_for("index"))
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)

        if not pubmed_ids:
            flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
            return redirect(url_for("index"))

        pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
        pubmed_texts = [r[1] for r in pubmed_results]
        pubmed_titles = [r[0] for r in pubmed_results]

        results = validate_document(pdf_path, pubmed_texts, method="pubmed", titles=pubmed_titles)

    # validate_document() has just refreshed the module-level keyword list.
    return render_template("NORUS.html", results=results, keywords=last_common_keywords)
|
167 |
+
|
168 |
+
@app.route("/download_report", methods=["POST"])
def download_report():
    """Export the most recent analysis results as a PDF report.

    Builds a one-table report from the module-level ``last_results`` and
    ``last_common_keywords``, writes it to ``NORUS_Report.pdf`` inside the
    upload folder and sends that file as an attachment.

    Returns:
        The PDF as a file download, or a redirect to the index page (with a
        flashed error) when there are no results to export.
    """
    if not last_results:
        flash("Nessun risultato da esportare.", "error")
        return redirect(url_for("index"))

    def latin1(text):
        # FPDF's built-in fonts (Arial here) only cover Latin-1. Text outside
        # that range (frequent in PubMed titles) would make PDF generation
        # fail, so unsupported characters are replaced with "?".
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "NORUS Tool - Report Analisi", ln=True, align="C")
    pdf.ln(10)
    pdf.set_font('Arial', '', 12)
    pdf.multi_cell(0, 10, "Indice OUI = alpha(1 - sim/100) + beta(1 - overlap/100), con alpha = 0.7 e beta = 0.3.\nValori più bassi di OUI indicano maggiore similarità semantica e testuale.")
    pdf.ln(5)

    # Table header.
    pdf.set_font("Arial", "B", 12)
    pdf.cell(90, 10, "Titolo", 1)
    pdf.cell(30, 10, "Sim %", 1)
    pdf.cell(30, 10, "Overlap %", 1)
    pdf.cell(30, 10, "OUI", 1)
    pdf.ln()

    # One row per compared document.
    pdf.set_font("Arial", "", 11)
    for res in last_results:
        # A missing title must not crash len()/slicing below.
        title = latin1(res["title"] or "")
        if len(title) > 43:
            # Truncate so the title fits its 90-unit-wide cell.
            title = title[:40] + "..."
        pdf.cell(90, 10, title, 1)
        pdf.cell(30, 10, str(res["similarity"]), 1)
        pdf.cell(30, 10, str(res["token_overlap"]), 1)
        pdf.cell(30, 10, str(res["oui"]), 1)
        pdf.ln()

    if last_common_keywords:
        pdf.ln(6)
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Parole chiave comuni:", ln=True)
        pdf.set_font("Arial", "", 11)
        for kw, count in last_common_keywords:
            pdf.cell(0, 10, latin1(f"- {kw} ({count})"), ln=True)

    # Footer line, positioned relative to the bottom of the page.
    pdf.set_y(-20)
    pdf.set_font("Arial", "I", 9)
    pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")

    output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
    pdf.output(output_path, 'F')

    return send_file(output_path, as_attachment=True)
|
214 |
+
|
215 |
+
if __name__ == "__main__":
    # The server listens on all interfaces, so the Werkzeug debugger (which
    # allows arbitrary code execution from the browser) must not be enabled by
    # default. Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
|
app_setup.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Download the NLTK resources into /home/user/nltk_data.
# Fail fast: abort (and therefore fail the calling build step) as soon as any
# command fails, instead of continuing with a partially prepared data folder.
set -euo pipefail

echo ">>> Setup NLTK resources..."
mkdir -p /home/user/nltk_data
python3 -m nltk.downloader -d /home/user/nltk_data punkt stopwords wordnet
|
6 |
+
|
index.html
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
<!doctype html>
|
2 |
-
<html>
|
3 |
-
<head>
|
4 |
-
<meta charset="utf-8" />
|
5 |
-
<meta name="viewport" content="width=device-width" />
|
6 |
-
<title>My static Space</title>
|
7 |
-
<link rel="stylesheet" href="style.css" />
|
8 |
-
</head>
|
9 |
-
<body>
|
10 |
-
<div class="card">
|
11 |
-
<h1>Welcome to your static Space!</h1>
|
12 |
-
<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
|
13 |
-
<p>
|
14 |
-
Also don't forget to check the
|
15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
16 |
-
</p>
|
17 |
-
</div>
|
18 |
-
</body>
|
19 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
models/__pycache__/similarity_model.cpython-313.pyc
ADDED
Binary file (762 Bytes). View file
|
|
models/similarity_model.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
3 |
+
|
4 |
+
def compute_similarity(text1, text2):
    """Compute the cosine similarity of two texts using TF-IDF vectors.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float | None: The similarity as a plain Python float, or ``None`` when
        either text is empty/blank or the computation fails (the error is
        printed).
    """
    try:
        # Both texts must contain something to vectorize.
        if not text1.strip() or not text2.strip():
            raise ValueError("Uno o entrambi i testi sono vuoti.")

        # TF-IDF vectorization with English stop words removed.
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])

        # Cosine similarity between the two document vectors.
        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

        # Convert the NumPy scalar to the documented built-in float so the
        # value can be serialised (e.g. to JSON) without surprises.
        return float(similarity_matrix[0][0])

    except Exception as e:
        print(f"Errore durante il calcolo della similarità: {e}")
        return None
|
32 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fpdf
|
2 |
+
flask
|
3 |
+
pdfplumber
|
4 |
+
nltk
|
5 |
+
sentence-transformers
|
6 |
+
scikit-learn
|
7 |
+
pandas
|
8 |
+
reportlab
|
9 |
+
matplotlib
|
10 |
+
requests
|
11 |
+
keybert
|
12 |
+
torch
|
13 |
+
transformers
|
14 |
+
spacy
|
start_local.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
echo "โ๏ธ Avvio dell'ambiente NORUS..."
|
3 |
+
python3 -m venv venv
|
4 |
+
source venv/bin/activate
|
5 |
+
pip install --upgrade pip
|
6 |
+
pip install -r requirements.txt
|
7 |
+
echo "โ
Ambiente pronto. Avvio del server Flask..."
|
8 |
+
python app.py
|
static/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
static/css/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
static/css/style.css
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Reset base */
|
2 |
+
html, body {
|
3 |
+
height: 100%;
|
4 |
+
margin: 0;
|
5 |
+
padding: 0;
|
6 |
+
overflow-y: auto;
|
7 |
+
}
|
8 |
+
|
9 |
+
/* Corpo */
|
10 |
+
body {
|
11 |
+
font-family: Arial, sans-serif;
|
12 |
+
background-color: #f8f8f8;
|
13 |
+
color: #333;
|
14 |
+
display: flex;
|
15 |
+
flex-direction: column;
|
16 |
+
min-height: 100vh;
|
17 |
+
}
|
18 |
+
|
19 |
+
/* Header */
|
20 |
+
header {
|
21 |
+
background-color: rgba(42, 77, 111, 0.9);
|
22 |
+
color: #fff;
|
23 |
+
padding: 20px;
|
24 |
+
text-align: center;
|
25 |
+
}
|
26 |
+
|
27 |
+
header h1 {
|
28 |
+
margin-bottom: 10px;
|
29 |
+
font-size: 2.2em;
|
30 |
+
}
|
31 |
+
|
32 |
+
header p {
|
33 |
+
font-size: 1.2em;
|
34 |
+
}
|
35 |
+
|
36 |
+
/* Logo */
|
37 |
+
#logo {
|
38 |
+
display: block;
|
39 |
+
margin: 0 auto;
|
40 |
+
max-width: 200px;
|
41 |
+
height: auto;
|
42 |
+
cursor: pointer;
|
43 |
+
transition: transform 0.3s ease;
|
44 |
+
}
|
45 |
+
|
46 |
+
#logo:hover {
|
47 |
+
transform: scale(1.2);
|
48 |
+
}
|
49 |
+
|
50 |
+
/* Form principale */
|
51 |
+
form {
|
52 |
+
margin: 20px auto;
|
53 |
+
width: 90%;
|
54 |
+
max-width: 800px;
|
55 |
+
padding: 25px;
|
56 |
+
background-color: #fff;
|
57 |
+
border-radius: 12px;
|
58 |
+
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
|
59 |
+
}
|
60 |
+
|
61 |
+
label {
|
62 |
+
display: block;
|
63 |
+
font-size: 1.05em;
|
64 |
+
margin: 12px 0 5px;
|
65 |
+
font-weight: bold;
|
66 |
+
color: #2a4d6f;
|
67 |
+
}
|
68 |
+
|
69 |
+
input[type="file"],
|
70 |
+
input[type="text"],
|
71 |
+
input[type="number"],
|
72 |
+
select {
|
73 |
+
width: 100%;
|
74 |
+
padding: 10px;
|
75 |
+
margin-bottom: 15px;
|
76 |
+
border: 1px solid #ccc;
|
77 |
+
border-radius: 6px;
|
78 |
+
font-size: 1em;
|
79 |
+
box-sizing: border-box;
|
80 |
+
}
|
81 |
+
|
82 |
+
/* Input file */
|
83 |
+
input[type="file"]::file-selector-button {
|
84 |
+
padding: 6px 12px;
|
85 |
+
margin-right: 10px;
|
86 |
+
background-color: #2a4d6f;
|
87 |
+
color: white;
|
88 |
+
border: none;
|
89 |
+
border-radius: 5px;
|
90 |
+
cursor: pointer;
|
91 |
+
transition: background-color 0.3s;
|
92 |
+
}
|
93 |
+
|
94 |
+
input[type="file"]::file-selector-button:hover {
|
95 |
+
background-color: #1a3d56;
|
96 |
+
}
|
97 |
+
|
98 |
+
/* Pulsanti */
|
99 |
+
button {
|
100 |
+
width: 100%;
|
101 |
+
background-color: #2a4d6f;
|
102 |
+
color: #fff;
|
103 |
+
padding: 12px;
|
104 |
+
border: none;
|
105 |
+
border-radius: 6px;
|
106 |
+
font-size: 1.1em;
|
107 |
+
cursor: pointer;
|
108 |
+
transition: background-color 0.3s, transform 0.2s;
|
109 |
+
margin-top: 10px;
|
110 |
+
}
|
111 |
+
|
112 |
+
button:hover {
|
113 |
+
background-color: #1a3d56;
|
114 |
+
transform: scale(1.02);
|
115 |
+
}
|
116 |
+
|
117 |
+
/* Risultati */
|
118 |
+
.results {
|
119 |
+
padding: 25px;
|
120 |
+
background-color: #fff;
|
121 |
+
margin: 30px auto;
|
122 |
+
border-radius: 12px;
|
123 |
+
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
|
124 |
+
max-width: 1000px;
|
125 |
+
overflow-x: auto;
|
126 |
+
}
|
127 |
+
|
128 |
+
/* Tabelle */
|
129 |
+
table {
|
130 |
+
width: 100%;
|
131 |
+
border-collapse: collapse;
|
132 |
+
margin-top: 25px;
|
133 |
+
}
|
134 |
+
|
135 |
+
th {
|
136 |
+
background-color: #2a4d6f;
|
137 |
+
color: #fff;
|
138 |
+
padding: 12px;
|
139 |
+
text-align: left;
|
140 |
+
}
|
141 |
+
|
142 |
+
td {
|
143 |
+
padding: 12px;
|
144 |
+
border-bottom: 1px solid #ddd;
|
145 |
+
background-color: #f9f9f9;
|
146 |
+
}
|
147 |
+
|
148 |
+
table tr:hover {
|
149 |
+
background-color: #eef3f7;
|
150 |
+
}
|
151 |
+
|
152 |
+
table th, table td {
|
153 |
+
font-size: 1em;
|
154 |
+
word-wrap: break-word;
|
155 |
+
}
|
156 |
+
|
157 |
+
/* Grafico */
|
158 |
+
#chart-container {
|
159 |
+
width: 100%;
|
160 |
+
max-width: 1000px;
|
161 |
+
height: 500px;
|
162 |
+
margin: 40px auto;
|
163 |
+
}
|
164 |
+
|
165 |
+
canvas {
|
166 |
+
width: 100% !important;
|
167 |
+
height: 100% !important;
|
168 |
+
display: block;
|
169 |
+
}
|
170 |
+
|
171 |
+
/* Barra di caricamento */
|
172 |
+
#progress-container {
|
173 |
+
width: 100%;
|
174 |
+
background-color: #e0e0e0;
|
175 |
+
border-radius: 20px;
|
176 |
+
overflow: hidden;
|
177 |
+
margin-top: 20px;
|
178 |
+
}
|
179 |
+
|
180 |
+
#progress-bar {
|
181 |
+
height: 20px;
|
182 |
+
width: 0;
|
183 |
+
background: linear-gradient(90deg, #4caf50 0%, #8bc34a 100%);
|
184 |
+
text-align: center;
|
185 |
+
line-height: 20px;
|
186 |
+
color: white;
|
187 |
+
font-weight: bold;
|
188 |
+
transition: width 0.4s ease;
|
189 |
+
}
|
190 |
+
|
191 |
+
/* Quando al 100%, barra diventa blu */
|
192 |
+
#progress-bar.complete {
|
193 |
+
background: linear-gradient(90deg, #2196f3 0%, #21cbf3 100%);
|
194 |
+
}
|
195 |
+
|
196 |
+
/* Footer */
|
197 |
+
footer {
|
198 |
+
background-color: #2a4d6f;
|
199 |
+
color: #fff;
|
200 |
+
text-align: center;
|
201 |
+
padding: 15px;
|
202 |
+
width: 100%;
|
203 |
+
font-size: 1em;
|
204 |
+
margin-top: auto;
|
205 |
+
}
|
206 |
+
|
207 |
+
/* Responsive layout */
|
208 |
+
@media screen and (max-width: 600px) {
|
209 |
+
form, .results {
|
210 |
+
width: 95%;
|
211 |
+
padding: 15px;
|
212 |
+
}
|
213 |
+
|
214 |
+
header h1 {
|
215 |
+
font-size: 1.5em;
|
216 |
+
}
|
217 |
+
|
218 |
+
header p {
|
219 |
+
font-size: 1em;
|
220 |
+
}
|
221 |
+
}
|
static/js/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
static/js/script.js
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Page behaviour for the NORUS form: logo animation, simulated progress bar,
// analysis-type option toggling, file-label updates and flash auto-dismiss.
document.addEventListener("DOMContentLoaded", function () {
  // Briefly enlarge the logo when its link is clicked.
  const logoLink = document.getElementById("logo-link");
  const logo = document.getElementById("logo");
  if (logoLink && logo) {
    logoLink.addEventListener("click", function () {
      logo.style.transform = "scale(1.5)";
      setTimeout(() => {
        logo.style.transform = "scale(1)";
      }, 500);
    });
  }

  // startProgress() can be triggered both by the form's inline onsubmit
  // attribute and by the submit listener registered below; this flag makes
  // the second call a no-op so only one set of timers is created.
  let progressRunning = false;

  /**
   * Show the progress bar and advance it from 0% to 100% over three minutes,
   * disabling the submit button meanwhile. The bar is a time-based estimate,
   * not a reflection of actual server progress.
   */
  function startProgress() {
    const progressBar = document.getElementById("progress-bar");
    const progressContainer = document.getElementById("progress-container");
    const analyzeBtn = document.querySelector("button[type='submit']");

    if (!progressBar || !progressContainer || !analyzeBtn || progressRunning) {
      return;
    }
    progressRunning = true;

    progressContainer.style.display = "block";
    analyzeBtn.disabled = true;
    analyzeBtn.textContent = "⏳ Analysis in progress...";

    const totalTime = 180000; // 3 minutes
    const intervalTime = totalTime / 100;
    let width = 0;

    // Restore the initial state of the bar and the button.
    const resetProgress = () => {
      progressContainer.style.display = "none";
      progressBar.style.width = "0%";
      progressBar.textContent = "0%";
      analyzeBtn.disabled = false;
      analyzeBtn.textContent = "Analyze";
      progressRunning = false;
    };

    const interval = setInterval(() => {
      if (width >= 100) {
        clearInterval(interval);
        progressBar.textContent = "100%";
        setTimeout(resetProgress, 1000);
      } else {
        width += 1;
        progressBar.style.width = width + "%";
        progressBar.textContent = width + "%";
      }
    }, intervalTime);

    // Safety fallback: re-enable the form even if the interval never finishes.
    setTimeout(() => {
      clearInterval(interval);
      resetProgress();
    }, totalTime + 3000);
  }

  // Exposed globally because the template calls it from an inline handler.
  window.startProgress = startProgress;

  const analysisForm = document.getElementById("analysisForm");
  if (analysisForm) {
    analysisForm.addEventListener("submit", function () {
      startProgress();
    });
  }

  // Show only the option group that matches the selected analysis type.
  const analysisType = document.getElementById("analysis_type");
  if (analysisType) {
    const pubmedOptions = document.getElementById("pubmed-options");
    const localOptions = document.getElementById("local-options");
    analysisType.addEventListener("change", function () {
      if (pubmedOptions) {
        pubmedOptions.style.display = this.value === "pubmed" ? "block" : "none";
      }
      if (localOptions) {
        localOptions.style.display = this.value === "local" ? "block" : "none";
      }
    });
    // Apply the initial state for the pre-selected value.
    analysisType.dispatchEvent(new Event("change"));
  }

  // Reflect the chosen main PDF in its label.
  const fileInput = document.getElementById("pdf_file");
  if (fileInput) {
    fileInput.addEventListener("change", function () {
      const fileLabel = document.querySelector('label[for="pdf_file"]');
      if (fileInput.files.length > 0 && fileLabel) {
        fileLabel.textContent = `Main PDF selected: ${fileInput.files[0].name}`;
      }
    });
  }

  // Reflect the number of chosen comparison PDFs in their label.
  const comparisonInput = document.getElementById("comparison_files");
  if (comparisonInput) {
    comparisonInput.addEventListener("change", function () {
      const label = document.querySelector('label[for="comparison_files"]');
      if (comparisonInput.files.length > 0 && label) {
        label.textContent = `${comparisonInput.files.length} comparison files selected`;
      }
    });
  }

  // Remove elements carrying the "error" class after five seconds.
  const flashMessages = document.querySelectorAll(".error");
  if (flashMessages.length > 0) {
    setTimeout(() => {
      flashMessages.forEach(message => message.remove());
    }, 5000);
  }
});
|
style.css
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
body {
|
2 |
-
padding: 2rem;
|
3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
4 |
-
}
|
5 |
-
|
6 |
-
h1 {
|
7 |
-
font-size: 16px;
|
8 |
-
margin-top: 0;
|
9 |
-
}
|
10 |
-
|
11 |
-
p {
|
12 |
-
color: rgb(107, 114, 128);
|
13 |
-
font-size: 15px;
|
14 |
-
margin-bottom: 10px;
|
15 |
-
margin-top: 5px;
|
16 |
-
}
|
17 |
-
|
18 |
-
.card {
|
19 |
-
max-width: 620px;
|
20 |
-
margin: 0 auto;
|
21 |
-
padding: 16px;
|
22 |
-
border: 1px solid lightgray;
|
23 |
-
border-radius: 16px;
|
24 |
-
}
|
25 |
-
|
26 |
-
.card p:last-child {
|
27 |
-
margin-bottom: 0;
|
28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
templates/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
templates/NORUS.html
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8" />
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
6 |
+
<title>NORUS Tool</title>
|
7 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
|
8 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
9 |
+
<script src="{{ url_for('static', filename='js/script.js') }}"></script>
|
10 |
+
</head>
|
11 |
+
<body>
|
12 |
+
<header>
|
13 |
+
<div style="text-align: center; margin-top: 20px;">
|
14 |
+
<a href="#" id="logo-link">
|
15 |
+
<img id="logo" src="https://i.imgur.com/MT5Sl9h.png" alt="NORUS Logo" style="width: 150px;" />
|
16 |
+
</a>
|
17 |
+
</div>
|
18 |
+
<h1>NORUS Tool</h1>
|
19 |
+
<p>Analyze your PDF and discover originality and similarity</p>
|
20 |
+
</header>
|
21 |
+
|
22 |
+
<main>
|
23 |
+
<form id="analysisForm" action="/validate" method="POST" enctype="multipart/form-data" onsubmit="startProgress()">
|
24 |
+
<label for="analysis_type">Choose Analysis Type:</label>
|
25 |
+
<select name="analysis_type" id="analysis_type" required>
|
26 |
+
<option value="local">Local Comparison</option>
|
27 |
+
<option value="pubmed">PubMed Search</option>
|
28 |
+
</select>
|
29 |
+
|
30 |
+
<div id="pubmed-options" style="display: none;">
|
31 |
+
<label for="query">PubMed Query:</label>
|
32 |
+
<input type="text" name="query" id="query" />
|
33 |
+
|
34 |
+
<label for="year_start">Start Year:</label>
|
35 |
+
<input type="number" name="year_start" id="year_start" min="1900" max="2025" value="2000" />
|
36 |
+
|
37 |
+
<label for="year_end">End Year:</label>
|
38 |
+
<input type="number" name="year_end" id="year_end" min="1900" max="2025" value="2025" />
|
39 |
+
|
40 |
+
<label for="num_articles">Number of Articles:</label>
|
41 |
+
<input type="number" name="num_articles" id="num_articles" min="1" value="10" />
|
42 |
+
</div>
|
43 |
+
|
44 |
+
<div id="local-options" style="display: none;">
|
45 |
+
<label for="comparison_files">Upload comparison PDFs (select multiple):</label>
|
46 |
+
<input type="file" name="comparison_files" id="comparison_files" multiple />
|
47 |
+
</div>
|
48 |
+
|
49 |
+
<label for="pdf_file">Upload your main PDF:</label>
|
50 |
+
<input type="file" name="pdf_file" id="pdf_file" required />
|
51 |
+
|
52 |
+
<button type="submit">Analyze</button>
|
53 |
+
</form>
|
54 |
+
|
55 |
+
<div id="progress-container" style="display: none;">
|
56 |
+
<p style="text-align: center;">โณ Analysis in progress...</p>
|
57 |
+
<div id="progress-bar">0%</div>
|
58 |
+
</div>
|
59 |
+
|
60 |
+
{% if results %}
|
61 |
+
<section>
|
62 |
+
<h2>Analysis Results</h2>
|
63 |
+
<table>
|
64 |
+
<thead>
|
65 |
+
<tr>
|
66 |
+
<th>Title</th>
|
67 |
+
<th>Semantic Similarity (%)</th>
|
68 |
+
<th>Token Overlap (%)</th>
|
69 |
+
<th>OUI (Originality & Uniqueness Index)</th>
|
70 |
+
</tr>
|
71 |
+
</thead>
|
72 |
+
<tbody>
|
73 |
+
{% for result in results %}
|
74 |
+
<tr>
|
75 |
+
<td style="max-width: 400px; word-wrap: break-word;">{{ result.title }}</td>
|
76 |
+
<td>{{ "%.2f"|format(result.similarity) }}</td>
|
77 |
+
<td>{{ "%.2f"|format(result.token_overlap) }}</td>
|
78 |
+
<td>{{ "%.2f"|format(result.oui) }}</td>
|
79 |
+
</tr>
|
80 |
+
{% endfor %}
|
81 |
+
</tbody>
|
82 |
+
</table>
|
83 |
+
|
84 |
+
{% if keywords %}
|
85 |
+
<div class="results" style="text-align: center; margin-top: 30px;">
|
86 |
+
<h3>๐ Common Keywords</h3>
|
87 |
+
<p>
|
88 |
+
{% for kw, count in keywords %}
|
89 |
+
<span style="margin: 5px; font-weight: bold;">{{ kw }} ({{ count }})</span>
|
90 |
+
{% endfor %}
|
91 |
+
</p>
|
92 |
+
</div>
|
93 |
+
{% endif %}
|
94 |
+
|
95 |
+
<form action="/download_report" method="post" style="text-align: center; margin-top: 30px;">
|
96 |
+
<button type="submit">๐ Download PDF Report</button>
|
97 |
+
</form>
|
98 |
+
|
99 |
+
<div id="chart-container" style="margin-top: 50px;">
|
100 |
+
<canvas id="similarityChart"></canvas>
|
101 |
+
</div>
|
102 |
+
</section>
|
103 |
+
{% endif %}
|
104 |
+
</main>
|
105 |
+
|
106 |
+
<footer><p>© 2025 NORUS Tool. All rights reserved.</p></footer>
|
107 |
+
|
108 |
+
<script>
  // Show only the option panel that matches the selected analysis type.
  // Runs once on load so the initial state is consistent with the <select>.
  document.addEventListener("DOMContentLoaded", () => {
    const typeSelect = document.getElementById("analysis_type");
    const pubmedPanel = document.getElementById("pubmed-options");
    const localPanel = document.getElementById("local-options");

    const syncPanels = () => {
      // Any value other than "pubmed" falls back to the local-directory panel.
      const usePubmed = typeSelect.value === "pubmed";
      pubmedPanel.style.display = usePubmed ? "block" : "none";
      localPanel.style.display = usePubmed ? "none" : "block";
    };

    typeSelect.addEventListener("change", syncPanels);
    syncPanels();
  });
</script>
|
128 |
+
|
129 |
+
{% if results %}
<script>
  // Grouped bar chart of the three scores for every compared document.
  // The data arrays are serialised with `tojson` rather than `list | safe`:
  // titles come from external sources (PubMed, file names), and a Python
  // list repr injected unescaped into a <script> is neither guaranteed to be
  // valid JavaScript nor safe against markup such as "</script>".
  // NOTE(review): assumes Chart.js is loaded earlier in the page — confirm.
  new Chart(document.getElementById('similarityChart'), {
    type: 'bar',
    data: {
      labels: {{ results | map(attribute='title') | list | tojson }},
      datasets: [
        {
          label: 'Semantic Similarity (%)',
          data: {{ results | map(attribute='similarity') | list | tojson }},
          backgroundColor: 'rgba(54, 162, 235, 0.7)',
          borderColor: 'rgba(54, 162, 235, 1)',
          borderWidth: 1
        },
        {
          label: 'Token Overlap (%)',
          data: {{ results | map(attribute='token_overlap') | list | tojson }},
          backgroundColor: 'rgba(255, 159, 64, 0.7)',
          borderColor: 'rgba(255, 159, 64, 1)',
          borderWidth: 1
        },
        {
          label: 'OUI (%)',
          data: {{ results | map(attribute='oui') | list | tojson }},
          backgroundColor: 'rgba(153, 102, 255, 0.7)',
          borderColor: 'rgba(153, 102, 255, 1)',
          borderWidth: 1
        }
      ]
    },
    options: {
      responsive: true,
      plugins: {
        legend: { position: 'top' },
        tooltip: { mode: 'index', intersect: false }
      },
      scales: {
        y: { beginAtZero: true },
        x: {
          // Keep every title visible; slant the labels so long titles fit.
          ticks: {
            autoSkip: false,
            maxRotation: 45,
            minRotation: 45
          }
        }
      }
    }
  });
</script>
{% endif %}
|
179 |
+
</body>
|
180 |
+
</html>
|
templates/app.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import pdfplumber
|
4 |
+
import numpy as np
|
5 |
+
from flask import Flask, render_template, request, redirect, url_for, flash
|
6 |
+
from werkzeug.utils import secure_filename
|
7 |
+
from sentence_transformers import SentenceTransformer, util
|
8 |
+
import nltk
|
9 |
+
from nltk.stem import WordNetLemmatizer, PorterStemmer
|
10 |
+
from nltk.tokenize import word_tokenize
|
11 |
+
from nltk.corpus import stopwords
|
12 |
+
|
13 |
+
# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word
# list below. This runs at import time, i.e. on every process start.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores its messages in the signed session cookie, which Flask
# refuses to create without a secret key. Prefer a stable key from the
# environment; the random fallback only invalidates pending flash messages
# when the process restarts.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Sentence-embedding model shared by all requests; loaded once at import.
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
|
26 |
+
|
27 |
+
def extract_pdf_text(pdf_path):
    """Return the lowercased, stripped text of every page of a PDF.

    Extraction is best-effort: any failure is reported on stdout and whatever
    text was collected before the failure (possibly none) is returned, so the
    function never raises.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined by newlines, lowercased and stripped; an empty
        string when nothing could be extracted.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() may yield None for pages without a text layer.
                page_texts.append(page.extract_text() or "")
    except Exception as e:  # deliberate best-effort: keep partial text
        print(f"Errore estrazione testo: {e}")
    # Join with a separator so the last word of one page is not fused with
    # the first word of the next one.
    return "\n".join(page_texts).lower().strip()
|
36 |
+
|
37 |
+
def preprocess_text(text):
    """Normalise *text* into a space-separated string of stemmed lemmas.

    The text is lowercased and tokenized; tokens that are not purely
    alphanumeric or that are English stop words are dropped, and each
    remaining token is lemmatized and then stemmed.
    """
    normalized = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        normalized.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return " ".join(normalized)
|
42 |
+
|
43 |
+
def calculate_token_overlap(text1: str, text2: str) -> float:
    """Return the share of distinct tokens of *text1* also found in *text2*.

    Tokens are whitespace-separated words compared case-insensitively. The
    measure is asymmetric: it is relative to the vocabulary of *text1* only.

    Args:
        text1: Reference text whose vocabulary forms the denominator.
        text2: Text that is searched for the tokens of *text1*.

    Returns:
        A percentage in [0, 100] rounded to two decimals; 0.0 when *text1*
        contains no tokens.
    """
    # Lowercase both sides: callers pass already-lowercased PDF text together
    # with raw-case abstracts, which would otherwise never match.
    tokens1 = set(text1.lower().split())
    tokens2 = set(text2.lower().split())
    if not tokens1:
        return 0.0
    return round(len(tokens1 & tokens2) / len(tokens1) * 100, 2)
|
48 |
+
|
49 |
+
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3) -> float:
    """Combine both scores into the Originality & Uniqueness Index (OUI).

    The index is the weighted sum of the complements of the two percentages,
    so lower similarity and lower overlap give a higher OUI.

    Args:
        similarity: Semantic similarity as a percentage.
        token_overlap: Token overlap as a percentage.
        alpha: Weight of the similarity complement.
        beta: Weight of the token-overlap complement.

    Returns:
        The OUI as a float percentage clamped to [0, 100] and rounded to two
        decimals.
    """
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    # Float bounds keep the return type a float even when the value is clamped.
    return round(max(0.0, min(oui * 100, 100.0)), 2)
|
52 |
+
|
53 |
+
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Score an uploaded PDF against each comparison source.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: With method "local", paths of PDFs to compare
            against; with any other method, the comparison texts themselves.
        method: "local" to read every source with extract_pdf_text, anything
            else to use the sources as plain text.
        titles: Optional titles aligned with comparison_sources. Entries
            without a title fall back to the file name ("local") or to
            "Unknown Title".

    Returns:
        A list with one dict per source holding "title", "similarity",
        "token_overlap" and "oui", each score rounded to two decimals.
    """
    pdf_text = extract_pdf_text(pdf_path)
    if not comparison_sources:
        return []

    # The uploaded document does not change between iterations, so embed it
    # once instead of once per comparison source.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    is_local = method == "local"

    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if is_local else doc
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        if titles and i < len(titles):
            title = titles[i]
        elif is_local:
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results
|
67 |
+
|
68 |
+
def fetch_pubmed_details(article_id, timeout=30):
    """Fetch title, abstract and keywords of one PubMed article via efetch.

    Args:
        article_id: PubMed identifier of the article.
        timeout: Seconds to wait for the NCBI server before giving up.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by
        the keywords. Missing parts are replaced by "No Title",
        "No Abstract" and "No Keywords". On a network, HTTP or XML parsing
        error the tuple ``("No Title", "No Abstract")`` is returned.
    """
    import xml.etree.ElementTree as ET

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request
        # handler indefinitely.
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    # Use itertext() rather than .text: .text is None or truncated as soon as
    # the element contains inline markup (<i>, <sub>, ...).
    title_elem = root.find(".//ArticleTitle")
    title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
    title = title or "No Title"

    # Structured abstracts are split over several AbstractText elements;
    # keep all of them, not only the first.
    sections = ("".join(elem.itertext()).strip() for elem in root.findall(".//AbstractText"))
    abstract = " ".join(section for section in sections if section) or "No Abstract"

    keyword_text = " ".join(kw.text for kw in root.findall(".//Keyword") if kw.text) or "No Keywords"

    print(f"\n๐ ARTICOLO RECUPERATO\n๐ Titolo: {title}\n๐ Abstract: {abstract[:500]}...\n๐ Keywords: {keyword_text}\n")
    return title, f"{abstract} {keyword_text}"
|
85 |
+
|
86 |
+
def fetch_pubmed(query, year_start, year_end, max_results=10, timeout=30):
    """Search PubMed and return the identifiers of the matching articles.

    Args:
        query: Search term, combined with the publication-date range.
        year_start: First publication year (inclusive bound of the range).
        year_end: Last publication year (inclusive bound of the range).
        max_results: Maximum number of identifiers requested (``retmax``).
        timeout: Seconds to wait for the NCBI server before giving up.

    Returns:
        The list of article identifiers from the esearch response, or an
        empty list when the request fails or the response is not valid JSON.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    # ValueError covers a non-JSON body on requests versions whose decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Errore recupero articoli PubMed: {e}")
        return []
|
97 |
+
|
98 |
+
@app.route("/")
def index():
    """Serve the NORUS page without any analysis results."""
    page = render_template("NORUS.html")
    return page
|
101 |
+
|
102 |
+
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the upload, compare it, render results.

    Reads ``pdf_file`` from the uploaded files and ``analysis_type``,
    ``local_directory``, ``query``, ``year_start``, ``year_end`` and
    ``num_articles`` from the form. Invalid input flashes an error message
    and redirects to the index page; otherwise NORUS.html is rendered with
    the comparison results.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()
    if not pdf_file:
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))

    filename = secure_filename(pdf_file.filename)
    if not filename:
        # secure_filename() can strip a name down to nothing; saving would
        # then target the upload directory itself and fail.
        flash("Carica un file PDF valido.", "error")
        return redirect(url_for("index"))
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(security): local_dir is supplied by the client and is used as
        # a server-side path; any readable directory can be listed and its
        # PDFs read. Restrict it to an allow-listed base directory before
        # exposing this endpoint publicly.
        if not os.path.isdir(local_dir):
            flash("Seleziona una directory valida.", "error")
            return redirect(url_for("index"))
        # Match the extension case-insensitively so "PAPER.PDF" is included.
        comparison_files = [
            os.path.join(local_dir, f)
            for f in os.listdir(local_dir)
            if f.lower().endswith(".pdf")
        ]
        if not comparison_files:
            flash("La directory non contiene PDF.", "error")
            return redirect(url_for("index"))
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            # An empty or non-numeric field falls back to the default
            # instead of failing the whole request.
            num_articles = 10
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [abstract for _, abstract in pubmed_results],
            method="pubmed",
            titles=[title for title, _ in pubmed_results],
        )
    return render_template("NORUS.html", results=results)
|
132 |
+
|
133 |
+
if __name__ == "__main__":
    # The Werkzeug debugger lets anyone who can reach it run arbitrary code,
    # so debug mode is opt-in (FLASK_DEBUG=1) rather than always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(debug=debug_enabled, port=7860)
|