TAgroup5 committed on
Commit ff871dc · verified · 1 Parent(s): 5cdc45c

Update app.py

Files changed (1):
  1. app.py +7 -5
app.py CHANGED
@@ -5,7 +5,7 @@ import io
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
 # Load fine-tuned model and tokenizer
-model_name = "TAgroup5/daily-mirror-news-classifier"
+model_name = "TAgroup5/news-classification-model"
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
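This hunk only swaps the Hub checkpoint name. The context imports pipeline but its call site sits outside the diff, so the following is a minimal sketch of how the renamed model would typically be wired into a text-classification pipeline; the classifier variable and the sample headline are hypothetical, not taken from app.py:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_name = "TAgroup5/news-classification-model"  # checkpoint introduced by this commit
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hypothetical call site: app.py imports `pipeline`, but this exact usage is not shown in the diff.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("Stocks rallied after the central bank held rates steady."))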
 
@@ -36,10 +36,12 @@ if uploaded_file is not None:
 
 # Preprocessing function
 def preprocess_text(text):
-    text = text.lower()  # Ensure consistent casing
-    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
-    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
-    return text
+    text = text.lower()  # Convert to lowercase
+    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters & numbers
+    tokens = word_tokenize(text)  # Tokenization
+    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
+    return " ".join(tokens)
 
 # Apply preprocessing and classification
 df['processed_content'] = df['content'].apply(preprocess_text)
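The rewritten preprocess_text references word_tokenize, stop_words, and lemmatizer, none of which are defined inside this hunk, so app.py must set them up elsewhere. A minimal sketch of the NLTK setup the new code presumably assumes (the download calls and resource names are standard NLTK; the exact setup in app.py is not shown):

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time resource downloads (no-ops once the data is cached)
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stopword lists, including English
nltk.download('wordnet')    # lexicon used by WordNetLemmatizer

stop_words = set(stopwords.words('english'))  # a set makes per-token lookups O(1)
lemmatizer = WordNetLemmatizer()

Two details worth noting: the new regex [^a-z\s] runs after text.lower(), so unlike the old [^a-zA-Z0-9\s] it also strips digits; and the one removed model_name line plus four removed preprocessing lines, against one added model_name line plus six added preprocessing lines, account for the commit's +7 -5 summary.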