TAgroup5 committed on
Commit
66d1db7
·
verified ·
1 Parent(s): ff72b25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -24
app.py CHANGED
@@ -2,31 +2,8 @@ import streamlit as st
2
  import pandas as pd
3
  import re
4
  import io
5
- import string
6
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
7
- from nltk.tokenize import word_tokenize
8
- from nltk.corpus import stopwords
9
- import os
10
- import nltk
11
- from nltk.stem import WordNetLemmatizer
12
-
13
- # Specify the directory for nltk_data
14
- nltk_data_dir = '/root/nltk_data'
15
-
16
- # Ensure the directory exists
17
- if not os.path.exists(nltk_data_dir):
18
- os.makedirs(nltk_data_dir)
19
-
20
- # Set the NLTK data path explicitly
21
- nltk.data.path.append(nltk_data_dir)
22
-
23
- # Try downloading required NLTK resources
24
- try:
25
- nltk.download('punkt', download_dir=nltk_data_dir)
26
- nltk.download('stopwords', download_dir=nltk_data_dir)
27
- nltk.download('wordnet', download_dir=nltk_data_dir)
28
- except Exception as e:
29
- print(f"Error while downloading NLTK resources: {e}")
30
 
31
  # Load fine-tuned model and tokenizer (adjust the model name)
32
  model_name = "TAgroup5/news-classification-model" # Replace with the correct model name
@@ -35,6 +12,8 @@ tokenizer = AutoTokenizer.from_pretrained(news-classification-model)
35
 
36
  # Initialize pipelines
37
  text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
 
 
38
 
39
  # Streamlit App
40
  st.title("News Classification and Q&A")
@@ -60,6 +39,7 @@ if uploaded_file is not None:
60
  # Preprocessing function to clean the text
61
  def preprocess_text(text):
62
  text = text.lower() # Convert to lowercase
 
63
  text = re.sub(r'[^a-z\s]', '', text) # Remove special characters & numbers
64
  # You don't need tokenization here, as the model tokenizer will handle it
65
  return text
 
2
  import pandas as pd
3
  import re
4
  import io
 
5
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
6
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # Load fine-tuned model and tokenizer (adjust the model name)
9
  model_name = "TAgroup5/news-classification-model" # Replace with the correct model name
 
12
 
13
  # Initialize pipelines
14
  text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
15
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
16
+
17
 
18
  # Streamlit App
19
  st.title("News Classification and Q&A")
 
39
  # Preprocessing function to clean the text
40
  def preprocess_text(text):
41
  text = text.lower() # Convert to lowercase
42
+ text = re.sub(r'\s+', ' ', text) # Remove extra spaces
43
  text = re.sub(r'[^a-z\s]', '', text) # Remove special characters & numbers
44
  # You don't need tokenization here, as the model tokenizer will handle it
45
  return text