mabil committed on
Commit
1422112
·
1 Parent(s): 617eecb

Fix: Correzione complessiva

Browse files
Files changed (2) hide show
  1. app.py +21 -6
  2. static/js/script.js +8 -2
app.py CHANGED
@@ -8,10 +8,18 @@ from transformers import AutoTokenizer
8
  from fpdf import FPDF # Usa fpdf per evitare errori con unicode
9
  from collections import Counter
10
  from io import BytesIO # Importa BytesIO per generare PDF in memoria
 
 
11
 
12
  # Usa Hugging Face tokenizer
13
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
14
 
 
 
 
 
 
 
15
  app = Flask(__name__)
16
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
17
  app.config["UPLOAD_FOLDER"] = "uploads"
@@ -32,14 +40,20 @@ def extract_pdf_text(pdf_path):
32
  print(f"Errore estrazione testo: {e}")
33
  return text.lower().strip()
34
 
 
 
 
 
 
 
 
35
  def preprocess_text(text):
36
  # Tokenizza il testo usando il tokenizer di Hugging Face
37
  tokens = tokenizer.tokenize(text.lower())
38
 
39
- # Filtra le parole per mantenere solo quelle significative (eliminando numeri, simboli non scientifici, ecc.)
40
- tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
41
-
42
- return tokens
43
 
44
  def calculate_token_overlap(text1, text2):
45
  tokens1 = set(text1.split())
@@ -72,7 +86,8 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
72
  title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
73
 
74
  common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
75
- all_keywords.extend(common_keywords)
 
76
 
77
  results.append({
78
  "title": title,
@@ -212,4 +227,4 @@ def download_report():
212
  return send_file(output_path, as_attachment=True) # Forza il download del file PDF
213
 
214
  if __name__ == "__main__":
215
- app.run(debug=True, host="0.0.0.0", port=7860)
 
8
  from fpdf import FPDF # Usa fpdf per evitare errori con unicode
9
  from collections import Counter
10
  from io import BytesIO # Importa BytesIO per generare PDF in memoria
11
+ import spacy
12
+ from nltk.corpus import stopwords
13
 
14
  # Usa Hugging Face tokenizer
15
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
16
 
17
+ # Carica il modello di lingua di spaCy
18
+ nlp = spacy.load("en_core_web_sm")
19
+
20
+ # Lista di stopwords da rimuovere (puoi aggiungere altre parole se necessario)
21
+ stop_words = set(stopwords.words("english"))
22
+
23
  app = Flask(__name__)
24
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
25
  app.config["UPLOAD_FOLDER"] = "uploads"
 
40
  print(f"Errore estrazione testo: {e}")
41
  return text.lower().strip()
42
 
43
+ # Funzione per filtrare stopwords e nomi propri
44
+ def filter_keywords(tokens):
45
+ # Elimina i nomi propri (taggati da spaCy come 'PROPN')
46
+ doc = nlp(" ".join(tokens))
47
+ filtered_tokens = [token.text for token in doc if token.pos_ != "PROPN" and token.text not in stop_words and len(token.text) > 3]
48
+ return filtered_tokens
49
+
50
  def preprocess_text(text):
51
  # Tokenizza il testo usando il tokenizer di Hugging Face
52
  tokens = tokenizer.tokenize(text.lower())
53
 
54
+ # Filtra le parole per mantenere solo quelle significative
55
+ filtered_tokens = filter_keywords(tokens) # Filtra le parole non pertinenti
56
+ return filtered_tokens
 
57
 
58
  def calculate_token_overlap(text1, text2):
59
  tokens1 = set(text1.split())
 
86
  title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
87
 
88
  common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
89
+ filtered_keywords = filter_keywords(common_keywords) # Applica il filtro per parole scientifiche
90
+ all_keywords.extend(filtered_keywords)
91
 
92
  results.append({
93
  "title": title,
 
227
  return send_file(output_path, as_attachment=True) # Forza il download del file PDF
228
 
229
  if __name__ == "__main__":
230
+ app.run(debug=True, host="0.0.0.0", port=7860)
static/js/script.js CHANGED
@@ -21,16 +21,22 @@ document.addEventListener("DOMContentLoaded", function () {
21
  analyzeBtn.textContent = "⏳ Analisi in corso...";
22
 
23
  let width = 0;
 
 
 
24
  const interval = setInterval(() => {
25
  if (width >= 100) {
26
  clearInterval(interval);
27
  progressBar.textContent = "100%";
 
 
 
28
  } else {
29
  width += 1;
30
  progressBar.style.width = width + "%";
31
  progressBar.textContent = width + "%";
32
  }
33
- }, 500); // rallentato (tempo di aggiornamento più lungo)
34
 
35
  // fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
36
  setTimeout(() => {
@@ -39,7 +45,7 @@ document.addEventListener("DOMContentLoaded", function () {
39
  progressContainer.style.display = "none";
40
  progressBar.style.width = "0%";
41
  progressBar.textContent = "0%";
42
- }, 10000);
43
  }
44
  }
45
 
 
21
  analyzeBtn.textContent = "⏳ Analisi in corso...";
22
 
23
  let width = 0;
24
+ const totalTime = 180000; // 3 minutes in milliseconds
25
+ const intervalTime = totalTime / 100; // Divide the time for each step (to fill the bar in 180s)
26
+
27
  const interval = setInterval(() => {
28
  if (width >= 100) {
29
  clearInterval(interval);
30
  progressBar.textContent = "100%";
31
+ setTimeout(() => {
32
+ progressContainer.style.display = "none"; // Hide the progress bar after completion
33
+ }, 1000); // Delay to allow the user to see the completion
34
  } else {
35
  width += 1;
36
  progressBar.style.width = width + "%";
37
  progressBar.textContent = width + "%";
38
  }
39
+ }, intervalTime); // Update the progress bar at the specified interval time
40
 
41
  // fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
42
  setTimeout(() => {
 
45
  progressContainer.style.display = "none";
46
  progressBar.style.width = "0%";
47
  progressBar.textContent = "0%";
48
+ }, totalTime + 2000); // Timeout after total time plus extra 2 seconds to hide progress
49
  }
50
  }
51