mabil committed
Commit 300a457 · 1 Parent(s): db5493b

Major improvement: semantic and token cleaning

Files changed (1)
  1. app.py +54 -19
app.py CHANGED
@@ -1,13 +1,14 @@
 import os
+import re
 import requests
 import pdfplumber
 from flask import Flask, render_template, request, redirect, url_for, flash, send_file
 from werkzeug.utils import secure_filename
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoTokenizer
-from fpdf import FPDF
+from fpdf import FPDF
 from collections import Counter
-from io import BytesIO
+from io import BytesIO
 
 tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
@@ -21,6 +22,20 @@ model = SentenceTransformer("allenai/scibert_scivocab_uncased")
 last_results = []
 last_common_keywords = []
 
+# Advanced text-cleaning helper
+def clean_text(text):
+    boilerplate_phrases = [
+        "in recent years", "this study", "data suggest that", "in conclusion",
+        "introduction", "methods", "results", "discussion", "this review", "we aimed to",
+        "the aim of this study", "background", "objective", "methodology", "results and discussion"
+    ]
+    text = text.lower()
+    for phrase in boilerplate_phrases:
+        text = text.replace(phrase, "")
+    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
+    return text.strip()
+
+# PDF text extraction
 def extract_pdf_text(pdf_path):
     text = ""
     try:
@@ -29,24 +44,34 @@ def extract_pdf_text(pdf_path):
                 text += page.extract_text() or " "
     except Exception as e:
         print(f"Text extraction error: {e}")
-    return text.lower().strip()
+    return clean_text(text)
 
+# Token preprocessing
 def preprocess_text(text):
     tokens = tokenizer.tokenize(text.lower())
     tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
+    # Minimal list of common scientific stopwords
+    stopwords = set([
+        "study", "data", "results", "analysis", "introduction", "conclusion",
+        "method", "methods", "objective", "discussion", "the", "and", "that", "this", "from", "with", "which"
+    ])
+    tokens = [token for token in tokens if token not in stopwords]
     return tokens
 
+# Improved token-overlap calculation
 def calculate_token_overlap(text1, text2):
     tokens1 = set(text1.split())
     tokens2 = set(text2.split())
     overlap = len(tokens1 & tokens2)
     return round((overlap / max(len(tokens1), 1)) * 100, 2)
 
+# Updated OUI formula
 def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
     oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
     result = round(oui * 100, 2)
     return 0.0 if result == -0.0 else result
 
+# Document validation
 def validate_document(pdf_path, comparison_sources, method="local", titles=None):
     pdf_text = extract_pdf_text(pdf_path)
     pdf_tokens = preprocess_text(pdf_text)
@@ -54,7 +79,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     all_keywords = []
 
     for i, doc in enumerate(comparison_sources):
-        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        doc_text = extract_pdf_text(doc) if method == "local" else clean_text(doc)
         doc_tokens = preprocess_text(doc_text)
 
         similarity = util.pytorch_cos_sim(
@@ -81,6 +106,7 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
     last_common_keywords = Counter(all_keywords).most_common(10)
     return results
 
+# Fetch article details from PubMed
 def fetch_pubmed_details(article_id):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -89,23 +115,21 @@ def fetch_pubmed_details(article_id):
         response.raise_for_status()
         import xml.etree.ElementTree as ET
         root = ET.fromstring(response.text)
-        title_element = root.find(".//ArticleTitle")
-        abstract_element = root.find(".//AbstractText")
-
-        title = title_element.text.strip() if title_element is not None and title_element.text else "No Title"
-        abstract = abstract_element.text.strip() if abstract_element is not None and abstract_element.text else "No Abstract"
-
+        title_elem = root.find(".//ArticleTitle")
+        abstract_elem = root.find(".//AbstractText")
+        title = title_elem.text if title_elem is not None else None
+        abstract = abstract_elem.text if abstract_elem is not None else None
         keywords = root.findall(".//Keyword")
         keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else ""
-
-        if title == "No Title" or abstract == "No Abstract":
-            return None  # If the title or abstract is missing, discard this article
-
-        return title, f"{abstract} {keyword_text}"
+        if title and abstract:
+            return title, f"{abstract} {keyword_text}"
+        else:
+            return None
     except Exception as e:
         print(f"Abstract retrieval error: {e}")
         return None
 
+# Fetch article IDs from PubMed
 def fetch_pubmed(query, year_start, year_end, max_results=10):
     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
     params = {
@@ -113,7 +137,7 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
         "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])",
         "retmax": max_results,
         "retmode": "json",
-        "sort": "relevance"
+        "sort": "relevance"  # very important: sort by relevance
     }
     try:
         response = requests.get(base_url, params=params)
@@ -159,12 +183,23 @@ def validate():
     year_end = request.form.get("year_end", "2025")
     num_articles = int(request.form.get("num_articles", "10"))
    pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
+
+    if not pubmed_ids:
+        flash("No articles found on PubMed. Adjust the query or the year range.", "error")
+        return redirect(url_for("index"))
+
     pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
+    total_articles = len(pubmed_ids)
+    valid_articles = len([r for r in pubmed_results if r is not None])
     pubmed_results = [r for r in pubmed_results if r is not None]
 
+    print(f"Found {total_articles} PubMed articles. Valid after checking: {valid_articles}.")
+
     if not pubmed_results:
         flash("No valid PubMed articles found. Adjust the query or the year range.", "error")
         return redirect(url_for("index"))
+    elif valid_articles < total_articles:
+        flash(f"⚠️ Only {valid_articles} valid articles found out of {total_articles} requested.", "warning")
 
     results = validate_document(pdf_path, [r[1] for r in pubmed_results], method="pubmed", titles=[r[0] for r in pubmed_results])
 
@@ -213,9 +248,9 @@ def download_report():
     pdf.cell(0, 10, "© 2025 NORUS Tool", 0, 0, "C")
 
     output_path = os.path.join(app.config["UPLOAD_FOLDER"], "NORUS_Report.pdf")
-    pdf.output(output_path, 'F')
+    pdf.output(output_path, 'F')
 
-    return send_file(output_path, as_attachment=True)
+    return send_file(output_path, as_attachment=True)
 
 if __name__ == "__main__":
-    app.run(debug=True, host="0.0.0.0", port=7860)
+    app.run(debug=True, host="0.0.0.0", port=7860)
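
For reference, a minimal sketch of how the helpers touched by this commit fit together. Everything below is illustrative: the sample strings are invented, and the 80.0 / 60.0 similarity and overlap percentages are assumed values, not output from the repository.

# Illustrative only: run next to app.py; importing app also loads the BERT tokenizer and the SciBERT model.
from app import clean_text, preprocess_text, calculate_token_overlap, calculate_oui

doc = "Introduction. In recent years, CRISPR screening has improved variant detection."
ref = "this study reports that crispr screening improves detection of pathogenic variants"

cleaned_doc = clean_text(doc)               # boilerplate phrases stripped, lowercased, whitespace collapsed
cleaned_ref = clean_text(ref)
doc_tokens = preprocess_text(cleaned_doc)   # alphabetic BERT tokens longer than 3 chars, stopwords dropped

overlap = calculate_token_overlap(cleaned_doc, cleaned_ref)   # % of whitespace-split words in doc also in ref

# OUI = alpha*(1 - similarity/100) + beta*(1 - token_overlap/100), reported on a 0-100 scale.
# With the default alpha=0.7 and beta=0.3, a similarity of 80.0 and an overlap of 60.0 give
# 0.7*0.2 + 0.3*0.4 = 0.26, i.e. an OUI of 26.0.
print(calculate_oui(80.0, 60.0))

Since both terms are inverted before weighting, a higher OUI simply means less semantic and lexical overlap with the comparison source.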