mabil committed on
Commit
1422112
·
1 Parent(s): 617eecb

Fix: Correzione complessiva

Browse files
Files changed (2) hide show
  1. app.py +21 -6
  2. static/js/script.js +8 -2
app.py CHANGED
@@ -8,10 +8,18 @@ from transformers import AutoTokenizer
8
  from fpdf import FPDF # Usa fpdf per evitare errori con unicode
9
  from collections import Counter
10
  from io import BytesIO # Importa BytesIO per generare PDF in memoria
 
 
11
 
12
  # Usa Hugging Face tokenizer
13
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
14
 
 
 
 
 
 
 
15
  app = Flask(__name__)
16
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
17
  app.config["UPLOAD_FOLDER"] = "uploads"
@@ -32,14 +40,20 @@ def extract_pdf_text(pdf_path):
32
  print(f"Errore estrazione testo: {e}")
33
  return text.lower().strip()
34
 
 
 
 
 
 
 
 
35
  def preprocess_text(text):
36
  # Tokenizza il testo usando il tokenizer di Hugging Face
37
  tokens = tokenizer.tokenize(text.lower())
38
 
39
- # Filtra le parole per mantenere solo quelle significative (eliminando numeri, simboli non scientifici, ecc.)
40
- tokens = [token for token in tokens if len(token) > 3 and token.isalpha()]
41
-
42
- return tokens
43
 
44
  def calculate_token_overlap(text1, text2):
45
  tokens1 = set(text1.split())
@@ -72,7 +86,8 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
72
  title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
73
 
74
  common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
75
- all_keywords.extend(common_keywords)
 
76
 
77
  results.append({
78
  "title": title,
@@ -212,4 +227,4 @@ def download_report():
212
  return send_file(output_path, as_attachment=True) # Forza il download del file PDF
213
 
214
  if __name__ == "__main__":
215
- app.run(debug=True, host="0.0.0.0", port=7860)
 
8
  from fpdf import FPDF # Usa fpdf per evitare errori con unicode
9
  from collections import Counter
10
  from io import BytesIO # Importa BytesIO per generare PDF in memoria
11
+ import spacy
12
+ from nltk.corpus import stopwords
13
 
14
  # Usa Hugging Face tokenizer
15
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
16
 
17
+ # Carica il modello di lingua di spaCy
18
+ nlp = spacy.load("en_core_web_sm")
19
+
20
+ # Lista di stopwords da rimuovere (puoi aggiungere altre parole se necessario)
21
+ stop_words = set(stopwords.words("english"))
22
+
23
  app = Flask(__name__)
24
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
25
  app.config["UPLOAD_FOLDER"] = "uploads"
 
40
  print(f"Errore estrazione testo: {e}")
41
  return text.lower().strip()
42
 
43
+ # Funzione per filtrare stopwords e nomi propri
44
+ def filter_keywords(tokens):
45
+ # Elimina i nomi propri (taggati da spaCy come 'PROPN')
46
+ doc = nlp(" ".join(tokens))
47
+ filtered_tokens = [token.text for token in doc if token.pos_ != "PROPN" and token.text not in stop_words and len(token.text) > 3]
48
+ return filtered_tokens
49
+
50
  def preprocess_text(text):
51
  # Tokenizza il testo usando il tokenizer di Hugging Face
52
  tokens = tokenizer.tokenize(text.lower())
53
 
54
+ # Filtra le parole per mantenere solo quelle significative
55
+ filtered_tokens = filter_keywords(tokens) # Filtra le parole non pertinenti
56
+ return filtered_tokens
 
57
 
58
  def calculate_token_overlap(text1, text2):
59
  tokens1 = set(text1.split())
 
86
  title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
87
 
88
  common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
89
+ filtered_keywords = filter_keywords(common_keywords) # Applica il filtro per parole scientifiche
90
+ all_keywords.extend(filtered_keywords)
91
 
92
  results.append({
93
  "title": title,
 
227
  return send_file(output_path, as_attachment=True) # Forza il download del file PDF
228
 
229
  if __name__ == "__main__":
230
+ app.run(debug=True, host="0.0.0.0", port=7860)
static/js/script.js CHANGED
@@ -21,16 +21,22 @@ document.addEventListener("DOMContentLoaded", function () {
21
  analyzeBtn.textContent = "⏳ Analisi in corso...";
22
 
23
  let width = 0;
 
 
 
24
  const interval = setInterval(() => {
25
  if (width >= 100) {
26
  clearInterval(interval);
27
  progressBar.textContent = "100%";
 
 
 
28
  } else {
29
  width += 1;
30
  progressBar.style.width = width + "%";
31
  progressBar.textContent = width + "%";
32
  }
33
- }, 500); // rallentato (tempo di aggiornamento più lungo)
34
 
35
  // fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
36
  setTimeout(() => {
@@ -39,7 +45,7 @@ document.addEventListener("DOMContentLoaded", function () {
39
  progressContainer.style.display = "none";
40
  progressBar.style.width = "0%";
41
  progressBar.textContent = "0%";
42
- }, 10000);
43
  }
44
  }
45
 
 
21
  analyzeBtn.textContent = "⏳ Analisi in corso...";
22
 
23
  let width = 0;
24
+ const totalTime = 180000; // 3 minutes in milliseconds
25
+ const intervalTime = totalTime / 100; // Divide the time for each step (to fill the bar in 180s)
26
+
27
  const interval = setInterval(() => {
28
  if (width >= 100) {
29
  clearInterval(interval);
30
  progressBar.textContent = "100%";
31
+ setTimeout(() => {
32
+ progressContainer.style.display = "none"; // Hide the progress bar after completion
33
+ }, 1000); // Delay to allow the user to see the completion
34
  } else {
35
  width += 1;
36
  progressBar.style.width = width + "%";
37
  progressBar.textContent = width + "%";
38
  }
39
+ }, intervalTime); // Update the progress bar at the specified interval time
40
 
41
  // fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
42
  setTimeout(() => {
 
45
  progressContainer.style.display = "none";
46
  progressBar.style.width = "0%";
47
  progressBar.textContent = "0%";
48
+ }, totalTime + 2000); // Timeout after total time plus extra 2 seconds to hide progress
49
  }
50
  }
51