mabil committed on
Commit
088b011
·
1 Parent(s): 8b3c01e

Fix: Improved OUI calculation and report in English, added article scoring, and optimized article selection

Browse files
Files changed (1) hide show
  1. app.py +48 -39
app.py CHANGED
@@ -47,40 +47,6 @@ def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
47
  result = round(oui * 100, 2)
48
  return 0.0 if result == -0.0 else result
49
 
50
- def validate_document(pdf_path, comparison_sources, method="local", titles=None):
51
- pdf_text = extract_pdf_text(pdf_path)
52
- pdf_tokens = preprocess_text(pdf_text)
53
- results = []
54
- all_keywords = []
55
-
56
- for i, doc in enumerate(comparison_sources):
57
- doc_text = extract_pdf_text(doc) if method == "local" else doc
58
- doc_tokens = preprocess_text(doc_text)
59
-
60
- similarity = util.pytorch_cos_sim(
61
- model.encode(pdf_text, convert_to_tensor=True),
62
- model.encode(doc_text, convert_to_tensor=True)
63
- ).item() * 100
64
-
65
- token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
66
- oui = calculate_oui(similarity, token_overlap)
67
- title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
68
-
69
- common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
70
- all_keywords.extend(common_keywords)
71
-
72
- results.append({
73
- "title": title,
74
- "similarity": round(similarity, 2),
75
- "token_overlap": round(token_overlap, 2),
76
- "oui": round(oui, 2)
77
- })
78
-
79
- global last_results, last_common_keywords
80
- last_results = results
81
- last_common_keywords = Counter(all_keywords).most_common(10)
82
- return results
83
-
84
  def fetch_pubmed_details(article_id):
85
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
86
  params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
@@ -97,9 +63,9 @@ def fetch_pubmed_details(article_id):
97
  title = title_elem.text if title_elem is not None else None
98
  abstract = abstract_elem.text if abstract_elem is not None else None
99
 
 
100
  if not title or not abstract:
101
- # Se manca titolo o abstract, restituisci un messaggio chiaro
102
- return "No Title", "No Abstract"
103
 
104
  # Recupero delle parole chiave
105
  keywords = root.findall(".//Keyword")
@@ -108,7 +74,7 @@ def fetch_pubmed_details(article_id):
108
  return title, f"{abstract} {keyword_text}"
109
  except Exception as e:
110
  print(f"Errore recupero abstract: {e}")
111
- return "No Title", "No Abstract"
112
 
113
  def fetch_pubmed(query, year_start, year_end, max_results=10):
114
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
@@ -128,6 +94,43 @@ def fetch_pubmed(query, year_start, year_end, max_results=10):
128
  print(f"Errore fetch PubMed: {e}")
129
  return []
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  @app.route("/")
132
  def index():
133
  return render_template("NORUS.html")
@@ -166,10 +169,16 @@ def validate():
166
  pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
167
 
168
  if not pubmed_ids:
169
- flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
170
  return redirect(url_for("index"))
171
 
172
  pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
 
 
 
 
 
 
173
  pubmed_texts = [r[1] for r in pubmed_results]
174
  pubmed_titles = [r[0] for r in pubmed_results]
175
 
@@ -225,4 +234,4 @@ def download_report():
225
  return send_file(output_path, as_attachment=True)
226
 
227
  if __name__ == "__main__":
228
- app.run(debug=True, host="0.0.0.0", port=7860)
 
47
  result = round(oui * 100, 2)
48
  return 0.0 if result == -0.0 else result
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def fetch_pubmed_details(article_id):
51
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
52
  params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
 
63
  title = title_elem.text if title_elem is not None else None
64
  abstract = abstract_elem.text if abstract_elem is not None else None
65
 
66
+ # Se manca titolo o abstract, ignoriamo questo articolo
67
  if not title or not abstract:
68
+ return None # Restituisce None se non ci sono titolo o abstract
 
69
 
70
  # Recupero delle parole chiave
71
  keywords = root.findall(".//Keyword")
 
74
  return title, f"{abstract} {keyword_text}"
75
  except Exception as e:
76
  print(f"Errore recupero abstract: {e}")
77
+ return None # Restituisce None se c'è un errore nella richiesta
78
 
79
  def fetch_pubmed(query, year_start, year_end, max_results=10):
80
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
 
94
  print(f"Errore fetch PubMed: {e}")
95
  return []
96
 
97
+ def validate_document(pdf_path, comparison_sources, method="local", titles=None):
98
+ pdf_text = extract_pdf_text(pdf_path)
99
+ pdf_tokens = preprocess_text(pdf_text)
100
+ results = []
101
+ all_keywords = []
102
+
103
+ for i, doc in enumerate(comparison_sources):
104
+ if not doc: # Salta gli articoli invalidi
105
+ continue
106
+
107
+ doc_text = extract_pdf_text(doc) if method == "local" else doc
108
+ doc_tokens = preprocess_text(doc_text)
109
+
110
+ similarity = util.pytorch_cos_sim(
111
+ model.encode(pdf_text, convert_to_tensor=True),
112
+ model.encode(doc_text, convert_to_tensor=True)
113
+ ).item() * 100
114
+
115
+ token_overlap = calculate_token_overlap(" ".join(pdf_tokens), " ".join(doc_tokens))
116
+ oui = calculate_oui(similarity, token_overlap)
117
+ title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
118
+
119
+ common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
120
+ all_keywords.extend(common_keywords)
121
+
122
+ results.append({
123
+ "title": title,
124
+ "similarity": round(similarity, 2),
125
+ "token_overlap": round(token_overlap, 2),
126
+ "oui": round(oui, 2)
127
+ })
128
+
129
+ global last_results, last_common_keywords
130
+ last_results = results
131
+ last_common_keywords = Counter(all_keywords).most_common(10)
132
+ return results
133
+
134
  @app.route("/")
135
  def index():
136
  return render_template("NORUS.html")
 
169
  pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
170
 
171
  if not pubmed_ids:
172
+ flash("Nessun articolo trovato su PubMed per questa ricerca.", "error")
173
  return redirect(url_for("index"))
174
 
175
  pubmed_results = [fetch_pubmed_details(id_) for id_ in pubmed_ids]
176
+ # Filtrare gli articoli che non hanno titolo o abstract
177
+ pubmed_results = [result for result in pubmed_results if result is not None]
178
+ if not pubmed_results:
179
+ flash("Nessun articolo valido trovato da PubMed.", "error")
180
+ return redirect(url_for("index"))
181
+
182
  pubmed_texts = [r[1] for r in pubmed_results]
183
  pubmed_titles = [r[0] for r in pubmed_results]
184
 
 
234
  return send_file(output_path, as_attachment=True)
235
 
236
  if __name__ == "__main__":
237
+ app.run(debug=True, host="0.0.0.0", port=7860)