mabil commited on
Commit
8c42b67
Β·
0 Parent(s):

🚀 Deploy NORUS tool completo su Hugging Face senza binary files

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ static/images/norus.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .DS_Store
4
+ venv/
5
+ uploads/
6
+ static/images/norus.png
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NORUS Tool 🧠📄
2
+
3
+ **NORUS** (Novelty and Originality Recognition Utility System) è uno strumento basato su intelligenza artificiale che consente l'analisi semantica di articoli scientifici in formato PDF, confrontandoli con articoli locali o pubblicati su PubMed. Il tool restituisce misure di **similarità semantica**, **token overlap** e un indice composito chiamato **OUI (Originality & Uniqueness Index)**.
4
+
5
+ ## 🚀 Funzionalità principali
6
+
7
+ - ✅ Caricamento PDF da analizzare
8
+ - 📂 Confronto con PDF locali o articoli da PubMed
9
+ - 🤖 Estrazione di embedding semantici tramite SciBERT
10
+ - 📊 Calcolo di:
11
+   - Similarità semantica (cosine similarity)
12
+   - Sovrapposizione testuale (token overlap)
13
+   - Indice OUI (originalità e novità)
14
+ - 📈 Visualizzazione interattiva dei risultati via Chart.js
15
+
16
+ ## 🧪 OUI - Originality & Uniqueness Index
17
+
18
+ ```math
19
+ OUI = 1 - (\alpha \times \text{semantic\_similarity} + \beta \times \text{token\_overlap})
20
+ ```
21
+
22
+ - α = 0.7 → penalizza la somiglianza semantica
23
+ - β = 0.3 → penalizza la ripetizione letterale
24
+ - L'OUI misura **quanto un documento è originale**, sia nel contenuto che nella forma.
25
+
26
+ ## 🧱 Architettura
27
+
28
+ - `Flask` come backend web
29
+ - `pdfplumber` per l'estrazione del testo dai PDF
30
+ - `nltk` per preprocessing linguistico
31
+ - `sentence-transformers` con modello `allenai/scibert_scivocab_uncased`
32
+ - `requests` per l'interfaccia con PubMed
33
+
34
+ ## 📂 Struttura del progetto
35
+
36
+ ```
37
+ .
38
+ ├── app.py
39
+ ├── Dockerfile
40
+ ├── requirements.txt
41
+ ├── static/
42
+ ├── templates/
43
+ ├── uploads/
44
+ ├── README.md
45
+ ```
46
+
47
+ ## ▢️ Esecuzione locale
48
+
49
+ Per eseguire localmente:
50
+
51
+ 1. Assicurati di avere Python 3.9+
52
+ 2. Installa le dipendenze:
53
+
54
+ ```bash
55
+ pip install -r requirements.txt
56
+ ```
57
+
58
+ 3. Avvia l'app:
59
+
60
+ ```bash
61
+ python app.py
62
+ ```
63
+
64
+ Apri il browser su `http://localhost:7860`
65
+
66
+ ## 📡 Deploy su Hugging Face Spaces
67
+
68
+ Puoi caricare questo progetto come Space Docker-based su Hugging Face. Il `Dockerfile` è già configurato.
69
+
70
+ ---
71
+ Developed by Marina Bilotta – Computational Chemistry & AI Research
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pdfplumber
4
+ import numpy as np
5
+ from flask import Flask, render_template, request, redirect, url_for, flash
6
+ from werkzeug.utils import secure_filename
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import nltk
9
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
10
+ from nltk.tokenize import word_tokenize
11
+ from nltk.corpus import stopwords
12
+
13
# Fetch the NLTK resources needed by the tokenizer, lemmatizer and stop-word list.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores its messages in the session, and Flask refuses to write to the
# session without a secret key; without this line every validation error path
# in validate() raises RuntimeError instead of showing the message.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Embedding model, loaded once at import time and reused by validate_document().
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
26
+
27
def extract_pdf_text(pdf_path):
    """Return the lower-cased text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, lower-cased and stripped.
        If the file cannot be opened or parsed, the error is printed and
        whatever was extracted before the failure (possibly "") is returned.
    """
    pages = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Fall back to an empty string when a page yields no text.
                pages.append(page.extract_text() or "")
    except Exception as e:
        # Best effort: an unreadable PDF must not abort the whole comparison.
        print(f"Errore estrazione testo: {e}")
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, which would distort the token-overlap metric.
    return "\n".join(pages).lower().strip()
36
+
37
def preprocess_text(text):
    """Normalise text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are dropped, and every
    remaining token is lemmatised and then stemmed.
    """
    normalised = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        normalised.append(stemmer.stem(lemma))
    return " ".join(normalised)
42
+
43
def calculate_token_overlap(text1, text2):
    """Return the percentage of text1's distinct tokens that also occur in text2.

    Tokens are whitespace-separated words. The result is rounded to two
    decimals and is 0.0 when text1 contains no tokens.
    """
    reference_tokens = set(text1.split())
    shared_tokens = reference_tokens.intersection(text2.split())
    # Guard against division by zero for an empty reference text.
    denominator = max(len(reference_tokens), 1)
    return round(len(shared_tokens) / denominator * 100, 2)
48
+
49
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality & Uniqueness Index as a percentage.

    Args:
        similarity: Semantic similarity, expressed as a percentage.
        token_overlap: Token overlap, expressed as a percentage.
        alpha: Weight of the semantic component.
        beta: Weight of the token-overlap component.

    Returns:
        The weighted index scaled to a percentage, clamped to the range
        0-100 and rounded to two decimals.
    """
    semantic_novelty = 1 - similarity / 100
    lexical_novelty = 1 - token_overlap / 100
    weighted = alpha * semantic_novelty + beta * lexical_novelty
    percentage = weighted * 100
    clamped = max(0, min(percentage, 100))
    return round(clamped, 2)
52
+
53
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare a PDF against a collection of reference documents.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: PDF paths when ``method == "local"``; for any
            other method, the text of each reference document.
        method: "local" reads each source through ``extract_pdf_text``; any
            other value uses each source as text directly.
        titles: Optional display titles, matched to the sources by position.

    Returns:
        One dict per source with the keys "title", "similarity",
        "token_overlap" and "oui"; the three metrics are percentages rounded
        to two decimals.
    """
    pdf_text = extract_pdf_text(pdf_path)
    if not comparison_sources:
        return []

    # The analysed document is the same for every comparison, so its embedding
    # is computed once here instead of once per reference document.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    is_local = method == "local"

    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if is_local else doc
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        # Prefer an explicit title, then the file name, then a placeholder.
        if titles and i < len(titles):
            title = titles[i]
        elif is_local:
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results
67
+
68
def fetch_pubmed_details(article_id):
    """Fetch the title and comparison text of one PubMed article.

    Args:
        article_id: PubMed identifier passed to the efetch endpoint.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by
        the article keywords. ``("No Title", "No Abstract")`` is returned when
        the request fails or the response is not valid XML.
    """
    import xml.etree.ElementTree as ET

    def element_text(element):
        # itertext() also collects text nested inside inline markup, where
        # ``.text`` alone can be None or truncated.
        return "".join(element.itertext()).strip()

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML parser honours the document's own
        # encoding declaration.
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    title_node = root.find(".//ArticleTitle")
    title = (element_text(title_node) if title_node is not None else "") or "No Title"

    # An abstract may be split into several AbstractText sections; keep them all.
    sections = [element_text(node) for node in root.findall(".//AbstractText")]
    abstract = " ".join(section for section in sections if section) or "No Abstract"

    keywords = [element_text(node) for node in root.findall(".//Keyword")]
    keyword_text = " ".join(keyword for keyword in keywords if keyword) or "No Keywords"

    print(f"\n🔍 ARTICOLO RECUPERATO\n📖 Titolo: {title}\n📝 Abstract: {abstract[:500]}...\n🔑 Keywords: {keyword_text}\n")
    return title, f"{abstract} {keyword_text}"
85
+
86
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed and return the IDs of the matching articles.

    Args:
        query: Search term.
        year_start: First publication year of the date filter.
        year_end: Last publication year of the date filter.
        max_results: Maximum number of IDs to request.

    Returns:
        The list of article IDs from the response, or an empty list when the
        request fails or the response is not valid JSON.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decoding error is not a RequestException.
        print(f"Errore recupero articoli PubMed: {e}")
        return []
96
+
97
@app.route("/")
def index():
    """Serve the landing page that contains the analysis form."""
    page = render_template("NORUS.html")
    return page
100
+
101
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the upload, compare it, render results.

    Reads the uploaded PDF and the form fields, saves the PDF in the upload
    folder and compares it either with the PDFs of a directory ("local") or
    with PubMed search results ("pubmed"). Invalid input is reported with a
    flashed error message and a redirect back to the form.

    Returns:
        The rendered results page, or a redirect to the index page on error.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()

    def reject(message):
        # Report the problem to the user and send them back to the form.
        flash(message, "error")
        return redirect(url_for("index"))

    if not pdf_file:
        return reject("Carica un file PDF valido.")
    # secure_filename() can reduce a name to "", which would make the save
    # target the upload directory itself.
    filename = secure_filename(pdf_file.filename)
    if not filename:
        return reject("Carica un file PDF valido.")
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(review): local_directory is a server-side path chosen by the
        # client; on a public deployment consider restricting it to an
        # allow-listed base directory.
        if not os.path.isdir(local_dir):
            return reject("Seleziona una directory valida.")
        # Case-insensitive match so "PAPER.PDF" is not silently ignored;
        # sorted for a stable result order.
        comparison_files = sorted(
            os.path.join(local_dir, f)
            for f in os.listdir(local_dir)
            if f.lower().endswith(".pdf")
        )
        if not comparison_files:
            return reject("La directory non contiene PDF.")
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        if not query:
            return reject("Inserisci una query PubMed.")
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        # The field arrives as free text; a non-numeric value must not
        # surface as an unhandled ValueError.
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            num_articles = 0
        if num_articles < 1:
            return reject("Numero di articoli non valido.")
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [result[1] for result in pubmed_results],
            method="pubmed",
            titles=[result[0] for result in pubmed_results],
        )
    return render_template("NORUS.html", results=results)
131
+
132
if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution from the browser,
    # so it must be opted into explicitly rather than always enabled while
    # listening on every interface.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
models/similarity_model.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+
4
def compute_similarity(text1, text2):
    """
    Compute the cosine similarity between two texts using TF-IDF.

    Parameters:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: Similarity value (0 to 1), or None when the computation fails
        (for example when either text is blank); the error is printed.
    """
    try:
        # Reject blank input before vectorising.
        if not text1.strip() or not text2.strip():
            raise ValueError("Uno o entrambi i testi sono vuoti.")

        # TF-IDF vectorisation of both texts over a shared vocabulary.
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])

        # Cosine similarity between the first and the second row.
        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

        return similarity_matrix[0][0]

    except Exception as e:
        # Best effort by contract: callers receive None instead of an exception.
        print(f"Errore durante il calcolo della similarità: {e}")
        return None
32
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flask
2
+ pdfplumber
3
+ nltk
4
+ sentence-transformers
5
+ scikit-learn
6
+ pandas
7
+ reportlab
8
+ matplotlib
9
+ requests
10
+ keybert
11
+ torch
12
+ transformers
start_local.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ echo "βš™οΈ Avvio dell'ambiente NORUS..."
3
+ python3 -m venv venv
4
+ source venv/bin/activate
5
+ pip install --upgrade pip
6
+ pip install -r requirements.txt
7
+ echo "βœ… Ambiente pronto. Avvio del server Flask..."
8
+ python app.py
static/css/style.css ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Generale */
2
+ html, body {
3
+ height: 100%;
4
+ margin: 0;
5
+ padding: 0;
6
+ overflow-y: auto; /* Permette lo scrolling */
7
+ }
8
+
9
+ body {
10
+ font-family: Arial, sans-serif;
11
+ background-color: #f8f8f8;
12
+ color: #333;
13
+ display: flex;
14
+ flex-direction: column;
15
+ min-height: 100vh;
16
+ }
17
+
18
+ /* Header */
19
+ header {
20
+ background-color: rgba(42, 77, 111, 0.8); /* Aggiunge trasparenza */
21
+ color: #fff;
22
+ padding: 20px;
23
+ text-align: center;
24
+ position: relative;
25
+ }
26
+
27
+ header h1 {
28
+ margin-bottom: 10px;
29
+ font-size: 2em;
30
+ }
31
+
32
+ header p {
33
+ font-size: 1.2em;
34
+ margin-top: 0;
35
+ }
36
+
37
+ /* Logo */
38
+ #logo {
39
+ max-width: 200px;
40
+ cursor: pointer;
41
+ transition: transform 0.3s ease;
42
+ }
43
+
44
+ #logo:hover {
45
+ transform: scale(1.2);
46
+ }
47
+
48
+ /* Container principale */
49
+ .container {
50
+ flex: 1;
51
+ width: 100%;
52
+ display: flex;
53
+ flex-direction: column;
54
+ align-items: center;
55
+ padding: 20px;
56
+ }
57
+
58
+ /* Form */
59
+ form {
60
+ margin: 20px auto;
61
+ text-align: center;
62
+ width: 80%;
63
+ max-width: 800px;
64
+ padding: 25px;
65
+ background-color: #fff;
66
+ border-radius: 10px;
67
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
68
+ overflow-y: auto; /* Permette scrolling interno se necessario */
69
+ max-height: 90vh; /* Impedisce che il form esca dallo schermo */
70
+ }
71
+
72
+ /* Etichette e campi input */
73
+ label {
74
+ display: block;
75
+ font-size: 1.1em;
76
+ margin-bottom: 8px;
77
+ font-weight: bold;
78
+ }
79
+
80
+ input, select {
81
+ width: 100%;
82
+ padding: 10px;
83
+ margin-bottom: 15px;
84
+ border: 1px solid #ccc;
85
+ border-radius: 5px;
86
+ font-size: 1.1em;
87
+ }
88
+
89
+ /* Pulsanti */
90
+ button {
91
+ width: 100%;
92
+ background-color: #2a4d6f;
93
+ color: #fff;
94
+ padding: 12px 20px;
95
+ border: none;
96
+ border-radius: 5px;
97
+ cursor: pointer;
98
+ font-size: 1.2em;
99
+ transition: background-color 0.3s;
100
+ margin-top: 10px;
101
+ }
102
+
103
+ button:hover {
104
+ background-color: #1a3d56;
105
+ }
106
+
107
+ /* Sezione Risultati */
108
+ .results {
109
+ padding: 25px;
110
+ background-color: #fff;
111
+ margin: 30px auto;
112
+ border-radius: 10px;
113
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
114
+ max-width: 1000px;
115
+ overflow-y: auto;
116
+ max-height: 80vh;
117
+ }
118
+
119
+ /* Tabelle */
120
+ table {
121
+ width: 100%;
122
+ border-collapse: collapse;
123
+ margin-top: 25px;
124
+ }
125
+
126
+ th {
127
+ background-color: #2a4d6f;
128
+ color: #fff;
129
+ padding: 12px;
130
+ text-align: left;
131
+ }
132
+
133
+ td {
134
+ padding: 12px;
135
+ border-bottom: 1px solid #ddd;
136
+ background-color: #f8f9fa;
137
+ }
138
+
139
+ table tr:hover {
140
+ background-color: #f1f1f1;
141
+ }
142
+
143
+ table th, table td {
144
+ font-size: 1.1em;
145
+ }
146
+
147
+ /* Contenitore del grafico */
148
+ #chart-container {
149
+ width: 90%;
150
+ max-width: 1000px;
151
+ height: 600px;
152
+ margin: 40px auto;
153
+ padding-bottom: 20px;
154
+ }
155
+
156
+ canvas {
157
+ width: 100% !important;
158
+ height: 100% !important;
159
+ display: block;
160
+ }
161
+
162
+ #logo {
163
+ display: block; /* Assicura che il logo sia trattato come un blocco */
164
+ margin: 0 auto; /* Lo centra orizzontalmente */
165
+ max-width: 200px;
166
+ height: auto;
167
+ cursor: pointer;
168
+ transition: transform 0.3s ease;
169
+ position: relative; /* Evita sovrapposizioni strane */
170
+ z-index: 10; /* Assicura che sia sopra altri elementi */
171
+ }
172
+
173
+ /* Barra di caricamento */
174
+ #progress-container {
175
+ width: 100%;
176
+ background-color: #f3f3f3;
177
+ border-radius: 25px;
178
+ overflow: hidden;
179
+ margin: 20px 0;
180
+ }
181
+
182
+ #progress-bar {
183
+ height: 20px;
184
+ width: 0;
185
+ background-color: #4caf50;
186
+ text-align: center;
187
+ line-height: 20px;
188
+ color: white;
189
+ }
190
+
191
+ /* Footer */
192
+ footer {
193
+ background-color: #2a4d6f;
194
+ color: #fff;
195
+ text-align: center;
196
+ padding: 15px;
197
+ width: 100%;
198
+ font-size: 1.1em;
199
+ margin-top: auto; /* Il footer si posiziona in fondo alla pagina */
200
+ }
static/js/script.js ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// NOTE: this file is served as a static asset, so it is never processed by
// Jinja. The results chart is therefore created by the inline script in
// templates/NORUS.html, which has access to the data; creating it here with
// {{ ... }} placeholders made the whole file a syntax error.

// Handle of the running progress timer; prevents a second call from starting
// a parallel timer.
let progressTimer = null;

// Shows the progress bar and advances it by 1% every 100 ms up to 100%.
// Declared at top level because the form invokes it through its inline
// onsubmit="startProgress()" attribute, which needs a global function.
function startProgress() {
    const progressBar = document.getElementById("progress-bar");
    const progressContainer = document.getElementById("progress-container");
    if (!progressBar || !progressContainer || progressTimer !== null) {
        return;
    }
    progressContainer.style.display = "block";
    let width = 0;
    progressTimer = setInterval(() => {
        if (width >= 100) {
            clearInterval(progressTimer);
            progressTimer = null;
        } else {
            width++;
            progressBar.style.width = width + "%";
            progressBar.textContent = width + "%";
        }
    }, 100);
}

document.addEventListener("DOMContentLoaded", function () {
    // Logo interaction: briefly enlarge the logo when it is clicked.
    const logoLink = document.getElementById("logo-link");
    if (logoLink) {
        logoLink.addEventListener("click", function () {
            const logo = document.getElementById("logo");
            if (!logo) {
                return;
            }
            logo.style.transform = "scale(1.5)";
            setTimeout(() => {
                logo.style.transform = "scale(1)";
            }, 500);
        });
    }

    // Start the progress bar when a form with this id is submitted.
    const analysisForm = document.getElementById("analysisForm");
    if (analysisForm) {
        analysisForm.addEventListener("submit", function () {
            startProgress();
        });
    }

    // Show the selected file name in the upload label.
    const fileInput = document.getElementById("pdf_file");
    if (fileInput) {
        fileInput.addEventListener("change", function () {
            const fileLabel = document.querySelector('label[for="pdf_file"]');
            if (fileInput.files.length > 0 && fileLabel) {
                fileLabel.textContent = `File selected: ${fileInput.files[0].name}`;
            }
        });
    }

    // Remove error messages automatically after five seconds.
    const flashMessages = document.querySelectorAll(".error");
    if (flashMessages.length > 0) {
        setTimeout(() => {
            flashMessages.forEach(message => message.remove());
        }, 5000);
    }

    // Show only the options that belong to the selected analysis type.
    const analysisType = document.getElementById("analysis_type");
    if (analysisType) {
        analysisType.addEventListener("change", function () {
            const pubmedOptions = document.getElementById("pubmed-options");
            const localOptions = document.getElementById("local-options");
            if (pubmedOptions) {
                pubmedOptions.style.display = this.value === "pubmed" ? "block" : "none";
            }
            if (localOptions) {
                localOptions.style.display = this.value === "local" ? "block" : "none";
            }
        });
    }
});
templates/NORUS.html ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NORUS Tool</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
</head>
<body>
    <header>
        <div style="text-align: center; margin-top: 20px;">
            <a href="#" id="logo-link">
                {# NOTE(review): static file names are case-sensitive on Linux; confirm the logo is stored as images/NORUS.PNG. #}
                <img src="{{ url_for('static', filename='images/NORUS.PNG') }}" alt="NORUS Logo" id="logo" style="max-width: 200px; height: auto;">
            </a>
        </div>
        <h1>NORUS Tool</h1>
        <p>Analyze your PDF and discover originality and similarity</p>
    </header>
    <main>
        {# Show the validation errors flashed by the /validate view. #}
        {% with messages = get_flashed_messages(with_categories=true) %}
            {% for category, message in messages %}
                <p class="{{ category }}">{{ message }}</p>
            {% endfor %}
        {% endwith %}

        <form action="/validate" method="POST" enctype="multipart/form-data" onsubmit="startProgress()">
            <label for="analysis_type">Choose Analysis Type:</label>
            <select name="analysis_type" id="analysis_type" required>
                <option value="local">Local Directory</option>
                <option value="pubmed">PubMed Search</option>
            </select>

            <div id="pubmed-options" style="display: none;">
                <label for="query">PubMed Query:</label>
                <input type="text" name="query" id="query">

                <label for="year_start">Start Year:</label>
                <input type="number" name="year_start" id="year_start" min="1900" max="2025" value="2000">

                <label for="year_end">End Year:</label>
                <input type="number" name="year_end" id="year_end" min="1900" max="2025" value="2025">

                <label for="num_articles">Number of Articles:</label>
                <input type="number" name="num_articles" id="num_articles" min="1" value="10">
            </div>

            <div id="local-options" style="display: none;">
                <label for="local_directory">Select Local Directory:</label>
                <input type="text" name="local_directory" id="local_directory" placeholder="Enter directory path">
            </div>

            <label for="pdf_file">Upload PDF:</label>
            <input type="file" name="pdf_file" id="pdf_file" required>

            <button type="submit">Analyze</button>
        </form>

        <div id="progress-container" style="display: none;">
            <div id="progress-bar">0%</div>
        </div>

        {% if results %}
        <section>
            <h2>Analysis Results</h2>
            <table>
                <thead>
                    <tr>
                        <th>Title</th>
                        <th>Semantic Similarity (%)</th>
                        <th>Token Overlap (%)</th>
                        <th>OUI (Originality &amp; Uniqueness Index)</th>
                    </tr>
                </thead>
                <tbody>
                    {% for result in results %}
                    <tr>
                        <td style="max-width: 400px; word-wrap: break-word;">{{ result.title }}</td>
                        <td>{{ "%.2f"|format(result.similarity) }}</td>
                        <td>{{ "%.2f"|format(result.token_overlap) }}</td>
                        {# oui is already a 0-100 percentage; it must not be multiplied by 100 again. #}
                        <td>{{ "%.2f"|format(result.oui) }}</td>
                    </tr>
                    {% endfor %}
                </tbody>
            </table>

            <div id="chart-container" style="margin-top: 50px;">
                <canvas id="similarityChart"></canvas>
            </div>

            <script>
                // tojson emits valid, script-safe JSON; a Python list repr marked
                // "safe" is neither valid JavaScript for every title nor escaped.
                const labels = {{ results | map(attribute='title') | list | tojson }};
                const semanticData = {{ results | map(attribute='similarity') | list | tojson }};
                const tokenData = {{ results | map(attribute='token_overlap') | list | tojson }};
                // Already a 0-100 percentage, plotted as-is.
                const ouiData = {{ results | map(attribute='oui') | list | tojson }};

                new Chart(document.getElementById('similarityChart'), {
                    type: 'bar',
                    data: {
                        labels: labels,
                        datasets: [
                            {
                                label: 'Semantic Similarity (%)',
                                data: semanticData,
                                backgroundColor: 'rgba(54, 162, 235, 0.7)',
                                borderColor: 'rgba(54, 162, 235, 1)',
                                borderWidth: 1
                            },
                            {
                                label: 'Token Overlap (%)',
                                data: tokenData,
                                backgroundColor: 'rgba(255, 159, 64, 0.7)',
                                borderColor: 'rgba(255, 159, 64, 1)',
                                borderWidth: 1
                            },
                            {
                                label: 'OUI (%)',
                                data: ouiData,
                                backgroundColor: 'rgba(153, 102, 255, 0.7)',
                                borderColor: 'rgba(153, 102, 255, 1)',
                                borderWidth: 1
                            }
                        ]
                    },
                    options: {
                        responsive: true,
                        maintainAspectRatio: false,
                        plugins: {
                            legend: {
                                position: 'top',
                            },
                            tooltip: {
                                mode: 'index',
                                intersect: false
                            }
                        },
                        scales: {
                            y: { beginAtZero: true },
                            x: {
                                ticks: {
                                    autoSkip: false,
                                    maxRotation: 45,
                                    minRotation: 45
                                }
                            }
                        }
                    }
                });
            </script>
        </section>
        {% endif %}
    </main>
    <footer>
        <p>&copy; 2025 NORUS Tool. All rights reserved.</p>
    </footer>

    <script>
        document.addEventListener("DOMContentLoaded", function() {
            const analysisType = document.getElementById("analysis_type");
            const pubmedOptions = document.getElementById("pubmed-options");
            const localOptions = document.getElementById("local-options");

            // Show only the option group matching the selected analysis type.
            function toggleOptions() {
                if (analysisType.value === "pubmed") {
                    pubmedOptions.style.display = "block";
                    localOptions.style.display = "none";
                } else {
                    pubmedOptions.style.display = "none";
                    localOptions.style.display = "block";
                }
            }

            analysisType.addEventListener("change", toggleOptions);
            // Apply the initial state for the pre-selected option.
            toggleOptions();
        });
    </script>
</body>
</html>
templates/app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pdfplumber
4
+ import numpy as np
5
+ from flask import Flask, render_template, request, redirect, url_for, flash
6
+ from werkzeug.utils import secure_filename
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import nltk
9
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
10
+ from nltk.tokenize import word_tokenize
11
+ from nltk.corpus import stopwords
12
+
13
# Fetch the NLTK resources needed by the tokenizer, lemmatizer and stop-word list.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

app = Flask(__name__)
# flash() stores its messages in the session, and Flask refuses to write to the
# session without a secret key; without this line every validation error path
# in validate() raises RuntimeError instead of showing the message.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(24)
app.config["UPLOAD_FOLDER"] = "uploads"
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Embedding model, loaded once at import time and reused by validate_document().
model = SentenceTransformer("allenai/scibert_scivocab_uncased")
26
+
27
def extract_pdf_text(pdf_path):
    """Return the lower-cased text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, lower-cased and stripped.
        If the file cannot be opened or parsed, the error is printed and
        whatever was extracted before the failure (possibly "") is returned.
    """
    pages = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Fall back to an empty string when a page yields no text.
                pages.append(page.extract_text() or "")
    except Exception as e:
        # Best effort: an unreadable PDF must not abort the whole comparison.
        print(f"Errore estrazione testo: {e}")
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, which would distort the token-overlap metric.
    return "\n".join(pages).lower().strip()
36
+
37
def preprocess_text(text):
    """Normalise text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are dropped, and every
    remaining token is lemmatised and then stemmed.
    """
    normalised = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        normalised.append(stemmer.stem(lemma))
    return " ".join(normalised)
42
+
43
def calculate_token_overlap(text1, text2):
    """Return the percentage of text1's distinct tokens that also occur in text2.

    Tokens are whitespace-separated words. The result is rounded to two
    decimals and is 0.0 when text1 contains no tokens.
    """
    reference_tokens = set(text1.split())
    shared_tokens = reference_tokens.intersection(text2.split())
    # Guard against division by zero for an empty reference text.
    denominator = max(len(reference_tokens), 1)
    return round(len(shared_tokens) / denominator * 100, 2)
48
+
49
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    """Compute the Originality & Uniqueness Index as a percentage.

    Args:
        similarity: Semantic similarity, expressed as a percentage.
        token_overlap: Token overlap, expressed as a percentage.
        alpha: Weight of the semantic component.
        beta: Weight of the token-overlap component.

    Returns:
        The weighted index scaled to a percentage, clamped to the range
        0-100 and rounded to two decimals.
    """
    semantic_novelty = 1 - similarity / 100
    lexical_novelty = 1 - token_overlap / 100
    weighted = alpha * semantic_novelty + beta * lexical_novelty
    percentage = weighted * 100
    clamped = max(0, min(percentage, 100))
    return round(clamped, 2)
52
+
53
def validate_document(pdf_path, comparison_sources, method="local", titles=None):
    """Compare a PDF against a collection of reference documents.

    Args:
        pdf_path: Path of the PDF under analysis.
        comparison_sources: PDF paths when ``method == "local"``; for any
            other method, the text of each reference document.
        method: "local" reads each source through ``extract_pdf_text``; any
            other value uses each source as text directly.
        titles: Optional display titles, matched to the sources by position.

    Returns:
        One dict per source with the keys "title", "similarity",
        "token_overlap" and "oui"; the three metrics are percentages rounded
        to two decimals.
    """
    pdf_text = extract_pdf_text(pdf_path)
    if not comparison_sources:
        return []

    # The analysed document is the same for every comparison, so its embedding
    # is computed once here instead of once per reference document.
    pdf_embedding = model.encode(pdf_text, convert_to_tensor=True)
    is_local = method == "local"

    results = []
    for i, doc in enumerate(comparison_sources):
        doc_text = extract_pdf_text(doc) if is_local else doc
        doc_embedding = model.encode(doc_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(pdf_embedding, doc_embedding).item() * 100
        token_overlap = calculate_token_overlap(pdf_text, doc_text)
        oui = calculate_oui(similarity, token_overlap)

        # Prefer an explicit title, then the file name, then a placeholder.
        if titles and i < len(titles):
            title = titles[i]
        elif is_local:
            title = os.path.basename(doc)
        else:
            title = "Unknown Title"

        results.append({
            "title": title,
            "similarity": round(similarity, 2),
            "token_overlap": round(token_overlap, 2),
            "oui": round(oui, 2),
        })
    return results
67
+
68
def fetch_pubmed_details(article_id):
    """Fetch the title and comparison text of one PubMed article.

    Args:
        article_id: PubMed identifier passed to the efetch endpoint.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the abstract followed by
        the article keywords. ``("No Title", "No Abstract")`` is returned when
        the request fails or the response is not valid XML.
    """
    import xml.etree.ElementTree as ET

    def element_text(element):
        # itertext() also collects text nested inside inline markup, where
        # ``.text`` alone can be None or truncated.
        return "".join(element.itertext()).strip()

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML parser honours the document's own
        # encoding declaration.
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"Errore recupero abstract: {e}")
        return "No Title", "No Abstract"

    title_node = root.find(".//ArticleTitle")
    title = (element_text(title_node) if title_node is not None else "") or "No Title"

    # An abstract may be split into several AbstractText sections; keep them all.
    sections = [element_text(node) for node in root.findall(".//AbstractText")]
    abstract = " ".join(section for section in sections if section) or "No Abstract"

    keywords = [element_text(node) for node in root.findall(".//Keyword")]
    keyword_text = " ".join(keyword for keyword in keywords if keyword) or "No Keywords"

    print(f"\n🔍 ARTICOLO RECUPERATO\n📖 Titolo: {title}\n📝 Abstract: {abstract[:500]}...\n🔑 Keywords: {keyword_text}\n")
    return title, f"{abstract} {keyword_text}"
85
+
86
def fetch_pubmed(query, year_start, year_end, max_results=10):
    """Search PubMed and return the IDs of the matching articles.

    Args:
        query: Search term.
        year_start: First publication year of the date filter.
        year_end: Last publication year of the date filter.
        max_results: Maximum number of IDs to request.

    Returns:
        The list of article IDs from the response, or an empty list when the
        request fails or the response is not valid JSON.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
        "retmax": max_results,
        "retmode": "json",
    }
    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decoding error is not a RequestException.
        print(f"Errore recupero articoli PubMed: {e}")
        return []
97
+
98
@app.route("/")
def index():
    """Serve the landing page that contains the analysis form."""
    page = render_template("NORUS.html")
    return page
101
+
102
@app.route("/validate", methods=["POST"])
def validate():
    """Handle the analysis form: store the upload, compare it, render results.

    Reads the uploaded PDF and the form fields, saves the PDF in the upload
    folder and compares it either with the PDFs of a directory ("local") or
    with PubMed search results ("pubmed"). Invalid input is reported with a
    flashed error message and a redirect back to the form.

    Returns:
        The rendered results page, or a redirect to the index page on error.
    """
    pdf_file = request.files.get("pdf_file")
    analysis_type = request.form.get("analysis_type")
    local_dir = request.form.get("local_directory", "").strip()
    query = request.form.get("query", "").strip()

    def reject(message):
        # Report the problem to the user and send them back to the form.
        flash(message, "error")
        return redirect(url_for("index"))

    if not pdf_file:
        return reject("Carica un file PDF valido.")
    # secure_filename() can reduce a name to "", which would make the save
    # target the upload directory itself.
    filename = secure_filename(pdf_file.filename)
    if not filename:
        return reject("Carica un file PDF valido.")
    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    pdf_file.save(pdf_path)

    results = []
    if analysis_type == "local":
        # NOTE(review): local_directory is a server-side path chosen by the
        # client; on a public deployment consider restricting it to an
        # allow-listed base directory.
        if not os.path.isdir(local_dir):
            return reject("Seleziona una directory valida.")
        # Case-insensitive match so "PAPER.PDF" is not silently ignored;
        # sorted for a stable result order.
        comparison_files = sorted(
            os.path.join(local_dir, f)
            for f in os.listdir(local_dir)
            if f.lower().endswith(".pdf")
        )
        if not comparison_files:
            return reject("La directory non contiene PDF.")
        results = validate_document(pdf_path, comparison_files, method="local")
    elif analysis_type == "pubmed":
        if not query:
            return reject("Inserisci una query PubMed.")
        year_start = request.form.get("year_start", "2000")
        year_end = request.form.get("year_end", "2025")
        # The field arrives as free text; a non-numeric value must not
        # surface as an unhandled ValueError.
        try:
            num_articles = int(request.form.get("num_articles", "10"))
        except ValueError:
            num_articles = 0
        if num_articles < 1:
            return reject("Numero di articoli non valido.")
        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
        results = validate_document(
            pdf_path,
            [result[1] for result in pubmed_results],
            method="pubmed",
            titles=[result[0] for result in pubmed_results],
        )
    return render_template("NORUS.html", results=results)
132
+
133
if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution from the browser,
    # so it must be opted into explicitly rather than always enabled.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled, port=7860)