TAgroup5 committed
Commit ff72b25 · verified · 1 Parent(s): 08acf50

Update app.py

Files changed (1)
  1. app.py +7 -19
app.py CHANGED
@@ -6,9 +6,9 @@ import string
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-import nltk
 import os
+import nltk
+from nltk.stem import WordNetLemmatizer
 
 # Specify the directory for nltk_data
 nltk_data_dir = '/root/nltk_data'
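
For context, a minimal sketch of the nltk_data setup this hunk's surroundings reference. The download calls sit outside the hunk, so the resource names below are assumptions, not the app's actual list:

import os
import nltk

# Point NLTK at the custom data directory used in app.py
nltk_data_dir = '/root/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

try:
    # Resource names are illustrative; app.py's actual downloads may differ
    nltk.download('punkt', download_dir=nltk_data_dir)
    nltk.download('stopwords', download_dir=nltk_data_dir)
    nltk.download('wordnet', download_dir=nltk_data_dir)
except Exception as e:
    print(f"Error while downloading NLTK resources: {e}")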
@@ -28,21 +28,10 @@ try:
 except Exception as e:
     print(f"Error while downloading NLTK resources: {e}")
 
-# Proceed with your application code
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-
-# Initialize lemmatizer and stopwords
-lemmatizer = WordNetLemmatizer()
-stop_words = set(stopwords.words('english'))
-
-
-
 # Load fine-tuned model and tokenizer (adjust the model name)
 model_name = "TAgroup5/news-classification-model"  # Replace with the correct model name
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Initialize pipelines
 text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
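
As a quick usage note: a transformers text-classification pipeline returns a list of dicts with 'label' and 'score' keys, so the pipeline built above can be smoke-tested roughly like this (the sample headline and label are invented):

result = text_classification_pipeline("Stock markets rally after central bank decision")
print(result)  # e.g. [{'label': 'Business', 'score': 0.97}]; the label set depends on the fine-tuned model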
@@ -72,10 +61,9 @@ if uploaded_file is not None:
 def preprocess_text(text):
     text = text.lower()  # Convert to lowercase
     text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters & numbers
-    tokens = word_tokenize(text)  # Tokenization
-    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
-    return " ".join(tokens)
+    # You don't need tokenization here, as the model tokenizer will handle it
+    return text
+
 
 # Apply preprocessing and classification
 df['processed_content'] = df['content'].apply(preprocess_text)
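
Why the NLTK pass could be dropped, as a sketch: the model's own tokenizer splits raw text into subword IDs when the pipeline runs, so pre-tokenizing, stopword-stripping, and lemmatizing with NLTK is redundant and may even shift the input away from what the model saw during fine-tuning. The sample string is invented:

sample = "markets rallied sharply"
print(tokenizer.tokenize(sample))      # subword tokens, e.g. ['markets', 'rall', '##ied', ...] for a WordPiece vocab
print(tokenizer(sample)['input_ids'])  # the IDs the pipeline feeds to the model internally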
 
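Putting the change together, a minimal end-to-end sketch, assuming the uploaded CSV has a 'content' column as the diff suggests; the file name, the 'category' column, and truncation=True are illustrative additions, not part of the commit:

import pandas as pd

df = pd.read_csv("news.csv")  # hypothetical input file
df['processed_content'] = df['content'].apply(preprocess_text)

# Batch the cleaned texts through the pipeline; truncation guards against long articles
preds = text_classification_pipeline(df['processed_content'].tolist(), truncation=True)
df['category'] = [p['label'] for p in preds]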