mabil committed on
Commit
dc71ffd
·
1 Parent(s): fd22707

Fix: Replaced NLTK tokenizer with Hugging Face AutoTokenizer

Browse files
Files changed (1) hide show
  1. app.py +6 -17
app.py CHANGED
@@ -4,23 +4,12 @@ import pdfplumber
4
  from flask import Flask, render_template, request, redirect, url_for, flash, send_file
5
  from werkzeug.utils import secure_filename
6
  from sentence_transformers import SentenceTransformer, util
7
- import nltk
8
- from nltk.stem import WordNetLemmatizer, PorterStemmer
9
- from nltk.tokenize import word_tokenize
10
- from nltk.corpus import stopwords
11
  from fpdf import FPDF
12
  from collections import Counter
13
 
14
- # Download risorse NLTK (incluso punkt)
15
- nltk.download('punkt', download_dir='/home/user/nltk_data')
16
- nltk.download('wordnet', download_dir='/home/user/nltk_data')
17
- nltk.download('stopwords', download_dir='/home/user/nltk_data')
18
-
19
- nltk.data.path.append("/home/user/nltk_data")
20
-
21
- lemmatizer = WordNetLemmatizer()
22
- stemmer = PorterStemmer()
23
- stop_words = set(stopwords.words("english"))
24
 
25
  app = Flask(__name__)
26
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
@@ -43,9 +32,9 @@ def extract_pdf_text(pdf_path):
43
  return text.lower().strip()
44
 
45
  def preprocess_text(text):
46
- words = word_tokenize(text.lower())
47
- words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words if w.isalnum() and w not in stop_words and len(w) > 3]
48
- return words
49
 
50
  def calculate_token_overlap(text1, text2):
51
  tokens1 = set(text1.split())
 
4
  from flask import Flask, render_template, request, redirect, url_for, flash, send_file
5
  from werkzeug.utils import secure_filename
6
  from sentence_transformers import SentenceTransformer, util
7
+ from transformers import AutoTokenizer
 
 
 
8
  from fpdf import FPDF
9
  from collections import Counter
10
 
11
# Use the Hugging Face BERT (uncased) tokenizer in place of the old NLTK pipeline.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
 
 
 
 
 
 
 
13
 
14
  app = Flask(__name__)
15
  app.secret_key = os.environ.get("SECRET_KEY", "NORUS_secretkey_05")
 
32
  return text.lower().strip()
33
 
34
def preprocess_text(text):
    """Tokenize *text* with the Hugging Face tokenizer and keep informative tokens.

    Restores the filtering the old NLTK pipeline applied: the previous
    implementation kept only alphanumeric tokens (`w.isalnum()`), but the
    rewrite dropped that check, letting punctuation tokens and BERT
    wordpiece continuation markers (``##xxx``) leak into the output.
    The ``##`` marker is stripped before the length test so a subword
    like ``##ing`` is judged by its real length (3), not 5.

    Args:
        text: Raw document text. Lowercased here defensively, matching
            the previous behavior (the uncased tokenizer lowercases too).

    Returns:
        list[str]: alphanumeric tokens longer than 3 characters.
    """
    tokens = tokenizer.tokenize(text.lower())
    # Strip wordpiece continuation markers so subwords are treated as plain words.
    stripped = (tok[2:] if tok.startswith("##") else tok for tok in tokens)
    return [tok for tok in stripped if tok.isalnum() and len(tok) > 3]
38
 
39
  def calculate_token_overlap(text1, text2):
40
  tokens1 = set(text1.split())