mabil committed on
Commit cabbd61 · 1 Parent(s): 94bfde7

Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection
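For context, the OUI index produced in the report combines embedding similarity and token overlap as OUI = (alpha*(1 - similarity/100) + beta*(1 - token_overlap/100)) * 100, with alpha = 0.7 and beta = 0.3, so lower values mean the uploaded PDF is closer to the comparison article. A rough worked example with illustrative numbers (not taken from any real run): similarity = 85 and token overlap = 60 give OUI = (0.7 * 0.15 + 0.3 * 0.40) * 100 = 22.5.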

Files changed (1)
  1. app.py +59 -151
app.py CHANGED
@@ -1,14 +1,13 @@
import os
- import re
import requests
import pdfplumber
from flask import Flask, render_template, request, redirect, url_for, flash, send_file
from werkzeug.utils import secure_filename
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
- from fpdf import FPDF
+ from fpdf import FPDF
from collections import Counter
- from io import BytesIO
+ from io import BytesIO

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

@@ -22,20 +21,6 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
last_results = []
last_common_keywords = []

- # Funzione di cleaning avanzato
- def clean_text(text):
-     boilerplate_phrases = [
-         "in recent years", "this study", "data suggest that", "in conclusion",
-         "introduction", "methods", "results", "discussion", "this review", "we aimed to",
-         "the aim of this study", "background", "objective", "methodology", "results and discussion"
-     ]
-     text = text.lower()
-     for phrase in boilerplate_phrases:
-         text = text.replace(phrase, "")
-     text = re.sub(r'\s+', ' ', text)
-     return text.strip()
-
- # Estrazione testo PDF
def extract_pdf_text(pdf_path):
    text = ""
    try:
@@ -44,96 +29,51 @@ def extract_pdf_text(pdf_path):
                text += page.extract_text() or " "
    except Exception as e:
        print(f"Errore estrazione testo: {e}")
-     return clean_text(text)
-
- # Funzione per estrarre la sezione "Materiali e Metodi"
- def extract_materials_and_methods(pdf_path):
-     text = extract_pdf_text(pdf_path)
-     # Supponiamo che la sezione 'Materiali e Metodi' sia identificabile da uno degli headers comuni
-     start = text.lower().find("materials and methods")
-     if start == -1:
-         return text  # Restituisce tutto il testo se non trova la sezione
-     end = text.lower().find("results", start)
-     if end == -1:
-         end = len(text)  # Fino alla fine del documento se non trova la fine della sezione
-     return text[start:end]
-
- # Preprocessing testo
+     return text.lower().strip()
+
def preprocess_text(text):
    tokens = tokenizer.tokenize(text.lower())
    tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
-     stopwords = set([
-         "study", "data", "results", "analysis", "introduction", "conclusion",
-         "method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
-     ])
-     tokens = [token for token in tokens if token not in stopwords]
    return tokens

- # Calcolo token overlap
def calculate_token_overlap(text1, text2):
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    overlap = len(tokens1 & tokens2)
    return round((overlap / max(len(tokens1), 1)) * 100, 2)

- # Formula OUI
def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
    result = round(oui * 100, 2)
-     if result < 0:
-         result = 0  # Limita il valore a 0 se diventa negativo
-     return result
-
- # Funzione di scoring degli articoli
- def score_article(article, pdf_text):
-     # Calcola la similarità semantica
-     similarity = util.pytorch_cos_sim(
-         model.encode(pdf_text, convert_to_tensor=True),
-         model.encode(article['text'], convert_to_tensor=True)
-     ).item() * 100
-
-     # Calcola il numero di keyword comuni
-     tokens_pdf = preprocess_text(pdf_text)
-     tokens_article = preprocess_text(article['text'])
-     common_tokens = len(set(tokens_pdf) & set(tokens_article))
-
-     # Punteggio complessivo (modificabile in base ai tuoi parametri)
-     score = 0.7 * similarity + 0.3 * common_tokens
-     return score
-
- # Validazione documento
- def validate_document(pdf_path, comparison_sources, method="local", titles=None, num_articles=10):
-     pdf_text = extract_materials_and_methods(pdf_path)
+     return 0.0 if result == -0.0 else result
+
+ def validate_document(pdf_path, comparison_sources, method="local", titles=None):
+     pdf_text = extract_pdf_text(pdf_path)
    pdf_tokens = preprocess_text(pdf_text)
    results = []
    all_keywords = []

-     # Calcolare il punteggio di ogni articolo
-     scored_articles = []
    for i, doc in enumerate(comparison_sources):
-         doc_text = extract_materials_and_methods(doc) if method == "local" else clean_text(doc)
+         doc_text = extract_pdf_text(doc) if method == "local" else doc
        doc_tokens = preprocess_text(doc_text)

-         # Assegna un punteggio a ciascun articolo
-         article = {
-             'title': titles[i] if titles and i < len(titles) else os.path.basename(doc),
-             'text': doc_text
-         }
-         score = score_article(article, pdf_text)
-         if score <= 100:  # Esclude articoli con punteggi anomali
-             scored_articles.append((score, article))
-
-     # Ordina gli articoli in base al punteggio
-     scored_articles.sort(reverse=True, key=lambda x: x[0])
-
-     # Seleziona i migliori articoli in base al punteggio
-     for i in range(min(num_articles, len(scored_articles))):
-         article = scored_articles[i][1]
+         similarity = util.pytorch_cos_sim(
+             model.encode(pdf_text, convert_to_tensor=True),
+             model.encode(doc_text, convert_to_tensor=True)
+         ).item() * 100
+
+         token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
+         oui = calculate_oui(similarity, token_overlap)
+         title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
+
+         common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
+         all_keywords.extend(common_keywords)
+
        results.append({
-             "title": article['title'],
-             "similarity": round(scored_articles[i][0], 2),
-             "token_overlap": round(calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens)), 2),
-             "oui": calculate_oui(scored_articles[i][0], calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens)))
+             "title": title,
+             "similarity": round(similarity, 2),
+             "token_overlap": round(token_overlap, 2),
+             "oui": round(oui, 2)
        })

    global last_results, last_common_keywords
@@ -141,7 +81,6 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None,
    last_common_keywords = Counter(all_keywords).most_common(10)
    return results

- # Fetch dettagli articoli da PubMed
def fetch_pubmed_details(article_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -150,62 +89,32 @@ def fetch_pubmed_details(article_id):
        response.raise_for_status()
        import xml.etree.ElementTree as ET
        root = ET.fromstring(response.text)
-         title_elem = root.find(".//ArticleTitle")
-         abstract_elem = root.find(".//AbstractText")
-         title = title_elem.text if title_elem is not None else None
-         abstract = abstract_elem.text if abstract_elem is not None else None
+         title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
+         abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
        keywords = root.findall(".//Keyword")
        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
-         if title and abstract:
-             return title, f"{abstract} {keyword_text}"
-         else:
-             return None
+         return title, f"{abstract} {keyword_text}"
    except Exception as e:
        print(f"Errore recupero abstract: {e}")
-         return None
-
- # Fetch ID articoli da PubMed con modalità Re-Fill
- def fetch_pubmed_re_fill(query, year_start, year_end, desired_articles=10, batch_size=20):
-     base_search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
-     all_ids = []
-     fetched_articles = []
-     start = 0
-     attempts = 0
-     max_attempts = 5
-
-     while len(fetched_articles) < desired_articles and attempts < max_attempts:
-         params = {
-             "db": "pubmed",
-             "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
-             "retstart": start,
-             "retmax": batch_size,
-             "retmode": "json",
-             "sort": "relevance"
-         }
-         try:
-             response = requests.get(base_search_url, params=params)
-             response.raise_for_status()
-             batch_ids = response.json().get("esearchresult", {}).get("idlist", [])
-             if not batch_ids:
-                 break
-
-             all_ids.extend(batch_ids)
-             start += batch_size
-
-             for id_ in batch_ids:
-                 result = fetch_pubmed_details(id_)
-                 if result:
-                     fetched_articles.append(result)
-                 if len(fetched_articles) == desired_articles:
-                     break
-
-         except Exception as e:
-             print(f"Errore fetch batch PubMed: {e}")
-             break
-
-         attempts += 1
-
-     return fetched_articles
+         return "No Title", "No Abstract"
+
+ def fetch_pubmed(query, year_start, year_end, max_results=10):
+     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+     params = {
+         "db": "pubmed",
+         "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
+         "retmax": max_results,
+         "retmode": "json",
+         "sort": "relevance"  # <-- Qui abbiamo ordinato per rilevanza
+     }
+     try:
+         response = requests.get(base_url, params=params)
+         response.raise_for_status()
+         id_list = response.json().get("esearchresult", {}).get("idlist", [])
+         return id_list
+     except Exception as e:
+         print(f"Errore fetch PubMed: {e}")
+         return []

@app.route("/")
def index():
@@ -242,18 +151,17 @@ def validate():
    year_start = request.form.get("year_start", "2000")
    year_end = request.form.get("year_end", "2025")
    num_articles = int(request.form.get("num_articles", "10"))
+     pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)

-     pubmed_results = fetch_pubmed_re_fill(query, year_start, year_end, desired_articles=num_articles)
-
-     if not pubmed_results:
-         flash("❌ Nessun articolo PubMed valido trovato. Modifica la query o il range di anni.", "error")
+     if not pubmed_ids:
+         flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
        return redirect(url_for("index"))

-     if len(pubmed_results) < num_articles:
-         flash(f"⚠️ Solo {len(pubmed_results)} articoli validi trovati su {num_articles} richiesti.", "warning")
-         flash("💡 Suggerimento: prova a rendere la query più generale o aumentare il range di anni.", "info")
+     pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
+     pubmed_texts = [r[1] for r in pubmed_results]
+     pubmed_titles = [r[0] for r in pubmed_results]

-     results = validate_document(pdf_path, [r[1] for r in pubmed_results], method="pubmed", titles=[r[0] for r in pubmed_results])
+     results = validate_document(pdf_path, pubmed_texts, method="pubmed", titles=pubmed_titles)

    return render_template("NORUS.html", results=results, keywords=last_common_keywords)

@@ -266,13 +174,13 @@ def download_report():
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
-     pdf.cell(0, 10, "NORUS Tool - Report Analysis", ln=True, align="C")
+     pdf.cell(0, 10, "NORUS Tool - Report Analisi", ln=True, align="C")
    pdf.ln(10)
    pdf.set_font('Arial', '', 12)
-     pdf.multi_cell(0, 10, "OUI Index = alpha(1 - similarity/100) + beta(1 - overlap/100), with alpha = 0.7 and beta = 0.3.\nLower OUI values indicate higher semantic and textual similarity.")
+     pdf.multi_cell(0, 10, "Indice OUI = alpha(1 - sim/100) + beta(1 - overlap/100), con alpha = 0.7 e beta = 0.3.\nValori più bassi di OUI indicano maggiore similarità semantica e testuale.")
    pdf.ln(5)
    pdf.set_font("Arial", "B", 12)
-     pdf.cell(90, 10, "Title", 1)
+     pdf.cell(90, 10, "Titolo", 1)
    pdf.cell(30, 10, "Sim %", 1)
    pdf.cell(30, 10, "Overlap %", 1)
    pdf.cell(30, 10, "OUI", 1)
@@ -290,7 +198,7 @@ def download_report():
    if last_common_keywords:
        pdf.ln(6)
        pdf.set_font("Arial", "B", 12)
-         pdf.cell(0, 10, "Common Keywords:", ln=True)
+         pdf.cell(0, 10, "Parole chiave comuni:", ln=True)
        pdf.set_font("Arial", "", 11)
        for kw, count in last_common_keywords:
            pdf.cell(0, 10, f"- {kw} ({count})", ln=True)
@@ -300,7 +208,7 @@ def download_report():
    pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")

    output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
-     pdf.output(output_path, 'F')
+     pdf.output(output_path, 'F')

    return send_file(output_path, as_attachment=True)
214