Spaces:
Sleeping
Sleeping
Fix: Correzione complessiva
Browse files
- app.py +21 -6
- static/js/script.js +8 -2
app.py
CHANGED
@@ -8,10 +8,18 @@ from transformers import AutoTokenizer
|
|
8 |
from fpdf import FPDF # Usa fpdf per evitare errori con unicode
|
9 |
from collections import Counter
|
10 |
from io import BytesIO # Importa BytesIO per generare PDF in memoria
|
|
|
|
|
11 |
|
12 |
# Usa Hugging Face tokenizer
|
13 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
app = Flask(__name__)
|
16 |
app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
|
17 |
app.config["UPLOAD_FOLDER"] = "uploads"
|
@@ -32,14 +40,20 @@ def extract_pdf_text(pdf_path):
|
|
32 |
print(f"Errore estrazione testo: {e}")
|
33 |
return text.lower().strip()
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def preprocess_text(text):
    """Lower-case *text* and split it with the module-level tokenizer.

    Args:
        text: Raw string to tokenize.

    Returns:
        The token sequence produced by ``tokenizer.tokenize`` for the
        lower-cased input; no further filtering is applied.
    """
    lowered = text.lower()
    # Tokenize with the Hugging Face tokenizer loaded at import time.
    return tokenizer.tokenize(lowered)
|
43 |
|
44 |
def calculate_token_overlap(text1, text2):
|
45 |
tokens1 = set(text1.split())
|
@@ -72,7 +86,8 @@ def validate_document(pdf_path, comparison_sources, method="local", titles=None)
|
|
72 |
title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
|
73 |
|
74 |
common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
|
75 |
-
|
|
|
76 |
|
77 |
results.append({
|
78 |
"title": title,
|
@@ -212,4 +227,4 @@ def download_report():
|
|
212 |
return send_file(output_path, as_attachment=True) # Forza il download del file PDF
|
213 |
|
214 |
if __name__ == "__main__":
    # The interactive debugger that ships with Flask's debug mode lets a
    # browser client execute arbitrary Python, so it must not be on by
    # default for a server bound to all interfaces. Opt in explicitly with
    # FLASK_DEBUG=1 when developing locally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
|
|
|
8 |
from fpdf import FPDF # Usa fpdf per evitare errori con unicode
|
9 |
from collections import Counter
|
10 |
from io import BytesIO # Importa BytesIO per generare PDF in memoria
|
11 |
+
import spacy
|
12 |
+
from nltk.corpus import stopwords
|
13 |
|
14 |
# Usa Hugging Face tokenizer
|
15 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
16 |
|
17 |
+
# Carica il modello di lingua di spaCy
|
18 |
+
nlp = spacy.load("en_core_web_sm")
|
19 |
+
|
20 |
+
# Lista di stopwords da rimuovere (puoi aggiungere altre parole se necessario)
|
21 |
+
stop_words = set(stopwords.words("english"))
|
22 |
+
|
23 |
app = Flask(__name__)
|
24 |
app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
|
25 |
app.config["UPLOAD_FOLDER"] = "uploads"
|
|
|
40 |
print(f"Errore estrazione testo: {e}")
|
41 |
return text.lower().strip()
|
42 |
|
43 |
+
# Funzione per filtrare stopwords e nomi propri
|
44 |
+
def filter_keywords(tokens):
    """Drop proper nouns, stopwords, and short words from a token list.

    The tokens are joined with single spaces and re-parsed by the spaCy
    pipeline ``nlp``, so the returned strings are spaCy's own tokens and
    may not line up one-to-one with the input items.

    Args:
        tokens: Iterable of strings to filter.

    Returns:
        A list of token texts, in document order, that are not tagged
        ``PROPN``, are not in ``stop_words``, and are longer than three
        characters.
    """
    parsed = nlp(" ".join(tokens))
    kept = []
    for word in parsed:
        # Proper nouns are excluded first, before any text-based checks.
        if word.pos_ == "PROPN":
            continue
        surface = word.text
        if surface in stop_words or len(surface) <= 3:
            continue
        kept.append(surface)
    return kept
|
49 |
+
|
50 |
def preprocess_text(text):
    """Tokenize *text* and keep only the tokens that pass ``filter_keywords``.

    Args:
        text: Raw string; it is lower-cased before tokenization.

    Returns:
        Whatever ``filter_keywords`` returns for the tokenizer's output.
    """
    # Tokenize with the Hugging Face tokenizer loaded at import time.
    raw_tokens = tokenizer.tokenize(text.lower())
    # Keep only the meaningful words.
    return filter_keywords(raw_tokens)
|
|
|
57 |
|
58 |
def calculate_token_overlap(text1, text2):
|
59 |
tokens1 = set(text1.split())
|
|
|
86 |
title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
|
87 |
|
88 |
common_keywords = list(set(pdf_tokens) & set(doc_tokens))[:5]
|
89 |
+
filtered_keywords = filter_keywords(common_keywords) # Applica il filtro per parole scientifiche
|
90 |
+
all_keywords.extend(filtered_keywords)
|
91 |
|
92 |
results.append({
|
93 |
"title": title,
|
|
|
227 |
return send_file(output_path, as_attachment=True) # Forza il download del file PDF
|
228 |
|
229 |
if __name__ == "__main__":
    # The interactive debugger that ships with Flask's debug mode lets a
    # browser client execute arbitrary Python, so it must not be on by
    # default for a server bound to all interfaces. Opt in explicitly with
    # FLASK_DEBUG=1 when developing locally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
|
static/js/script.js
CHANGED
@@ -21,16 +21,22 @@ document.addEventListener("DOMContentLoaded", function () {
|
|
21 |
analyzeBtn.textContent = "⏳ Analisi in corso...";
|
22 |
|
23 |
let width = 0;
|
|
|
|
|
|
|
24 |
const interval = setInterval(() => {
|
25 |
if (width >= 100) {
|
26 |
clearInterval(interval);
|
27 |
progressBar.textContent = "100%";
|
|
|
|
|
|
|
28 |
} else {
|
29 |
width += 1;
|
30 |
progressBar.style.width = width + "%";
|
31 |
progressBar.textContent = width + "%";
|
32 |
}
|
33 |
-
},
|
34 |
|
35 |
// fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
|
36 |
setTimeout(() => {
|
@@ -39,7 +45,7 @@ document.addEventListener("DOMContentLoaded", function () {
|
|
39 |
progressContainer.style.display = "none";
|
40 |
progressBar.style.width = "0%";
|
41 |
progressBar.textContent = "0%";
|
42 |
-
},
|
43 |
}
|
44 |
}
|
45 |
|
|
|
21 |
analyzeBtn.textContent = "⏳ Analisi in corso...";
|
22 |
|
23 |
let width = 0;
|
24 |
+
const totalTime = 180000; // 3 minutes in milliseconds
|
25 |
+
const intervalTime = totalTime / 100; // Divide the time for each step (to fill the bar in 180s)
|
26 |
+
|
27 |
const interval = setInterval(() => {
|
28 |
if (width >= 100) {
|
29 |
clearInterval(interval);
|
30 |
progressBar.textContent = "100%";
|
31 |
+
setTimeout(() => {
|
32 |
+
progressContainer.style.display = "none"; // Hide the progress bar after completion
|
33 |
+
}, 1000); // Delay to allow the user to see the completion
|
34 |
} else {
|
35 |
width += 1;
|
36 |
progressBar.style.width = width + "%";
|
37 |
progressBar.textContent = width + "%";
|
38 |
}
|
39 |
+
}, intervalTime); // Update the progress bar at the specified interval time
|
40 |
|
41 |
// fallback per riabilitare il pulsante (verrà ignorato se il server risponde prima)
|
42 |
setTimeout(() => {
|
|
|
45 |
progressContainer.style.display = "none";
|
46 |
progressBar.style.width = "0%";
|
47 |
progressBar.textContent = "0%";
|
48 |
+
}, totalTime + 2000); // Timeout after total time plus extra 2 seconds to hide progress
|
49 |
}
|
50 |
}
|
51 |
|