Fix: Replaced NLTK tokenizer with Hugging Face AutoTokenizer
app.py CHANGED
@@ -4,23 +4,12 @@ import pdfplumber
 from flask import Flask, render_template, request, redirect, url_for, flash, send_file
 from werkzeug.utils import secure_filename
 from sentence_transformers import SentenceTransformer, util
-import nltk
-from nltk.stem import WordNetLemmatizer, PorterStemmer
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
+from transformers import AutoTokenizer
 from fpdf import FPDF
 from collections import Counter
 
-#
-
-nltk.download('wordnet', download_dir='/home/user/nltk_data')
-nltk.download('stopwords', download_dir='/home/user/nltk_data')
-
-nltk.data.path.append("/home/user/nltk_data")
-
-lemmatizer = WordNetLemmatizer()
-stemmer = PorterStemmer()
-stop_words = set(stopwords.words("english"))
+# Use the Hugging Face tokenizer
+tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
 app = Flask(__name__)
 app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")

@@ -43,9 +32,9 @@ def extract_pdf_text(pdf_path):
     return text.lower().strip()
 
 def preprocess_text(text):
-
-
-    return
+    # Tokenize the text using the Hugging Face tokenizer
+    tokens = tokenizer.tokenize(text.lower())
+    return [token for token in tokens if len(token) > 3]
 
 def calculate_token_overlap(text1, text2):
     tokens1 = set(text1.split())
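
For context, a minimal standalone sketch of the new tokenization path (not part of the commit itself): it mirrors the `preprocess_text` added above and assumes only that the `transformers` package is installed; the `sample` string and the `__main__` guard are illustrative.

from transformers import AutoTokenizer

# Same checkpoint as app.py; the vocabulary is downloaded and cached on first use
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(text):
    # WordPiece tokenization; subword continuations carry a '##' prefix
    tokens = tokenizer.tokenize(text.lower())
    # Length filter as in app.py; len() counts the '##' prefix, so a piece
    # like '##ing' (length 5) passes while 'the' (length 3) is dropped
    return [token for token in tokens if len(token) > 3]

if __name__ == '__main__':
    sample = 'Tokenization with subword pieces behaves differently from word tokenization.'
    print(preprocess_text(sample))

Unlike the removed NLTK code, nothing is downloaded into /home/user/nltk_data at startup, and there is no lemmatizer, stemmer, or stopword list: the length filter is the only remaining substitute for stopword removal.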