TAgroup5 committed
Commit ff72b25 · verified · 1 Parent(s): 08acf50

Update app.py

Files changed (1)
  1. app.py +7 -19
app.py CHANGED
@@ -6,9 +6,9 @@ import string
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-import nltk
 import os
+import nltk
+from nltk.stem import WordNetLemmatizer
 
 # Specify the directory for nltk_data
 nltk_data_dir = '/root/nltk_data'
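
For context, a minimal sketch of the nltk_data setup this hunk's surroundings reference. The download calls sit outside the hunk, so the resource names below are assumptions, not the app's actual list:

import os
import nltk

# Point NLTK at the custom data directory used in app.py
nltk_data_dir = '/root/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

try:
    # Resource names are illustrative; app.py's actual downloads may differ
    nltk.download('punkt', download_dir=nltk_data_dir)
    nltk.download('stopwords', download_dir=nltk_data_dir)
    nltk.download('wordnet', download_dir=nltk_data_dir)
except Exception as e:
    print(f"Error while downloading NLTK resources: {e}")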
@@ -28,21 +28,10 @@ try:
 except Exception as e:
     print(f"Error while downloading NLTK resources: {e}")
 
-# Proceed with your application code
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-
-# Initialize lemmatizer and stopwords
-lemmatizer = WordNetLemmatizer()
-stop_words = set(stopwords.words('english'))
-
-
-
 # Load fine-tuned model and tokenizer (adjust the model name)
 model_name = "TAgroup5/news-classification-model"  # Replace with the correct model name
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Initialize pipelines
 text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
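
As a quick usage note: a transformers text-classification pipeline returns a list of dicts with 'label' and 'score' keys, so the pipeline built above can be smoke-tested roughly like this (the sample headline and label are invented):

result = text_classification_pipeline("Stock markets rally after central bank decision")
print(result)  # e.g. [{'label': 'Business', 'score': 0.97}]; the label set depends on the fine-tuned model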
@@ -72,10 +61,9 @@ if uploaded_file is not None:
 def preprocess_text(text):
     text = text.lower()  # Convert to lowercase
     text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters & numbers
-    tokens = word_tokenize(text)  # Tokenization
-    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
-    return " ".join(tokens)
+    # You don't need tokenization here, as the model tokenizer will handle it
+    return text
+
 
 # Apply preprocessing and classification
 df['processed_content'] = df['content'].apply(preprocess_text)
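
Why the NLTK pass could be dropped, as a sketch: the model's own tokenizer splits raw text into subword IDs when the pipeline runs, so pre-tokenizing, stopword-stripping, and lemmatizing with NLTK is redundant and may even shift the input away from what the model saw during fine-tuning. The sample string is invented:

sample = "markets rallied sharply"
print(tokenizer.tokenize(sample))      # subword tokens, e.g. ['markets', 'rall', '##ied', ...] for a WordPiece vocab
print(tokenizer(sample)['input_ids'])  # the IDs the pipeline feeds to the model internally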
 
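Putting the change together, a minimal end-to-end sketch, assuming the uploaded CSV has a 'content' column as the diff suggests; the file name, the 'category' column, and truncation=True are illustrative additions, not part of the commit:

import pandas as pd

df = pd.read_csv("news.csv")  # hypothetical input file
df['processed_content'] = df['content'].apply(preprocess_text)

# Batch the cleaned texts through the pipeline; truncation guards against long articles
preds = text_classification_pipeline(df['processed_content'].tolist(), truncation=True)
df['category'] = [p['label'] for p in preds]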