Ashendilantha committed on
Commit
bada648
·
verified ·
1 Parent(s): a44885d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -38
app.py CHANGED
@@ -4,19 +4,20 @@ import re
4
  import string
5
  import nltk
6
  from nltk.corpus import stopwords
7
- from nltk.tokenize import word_tokenize
8
  from nltk.stem import WordNetLemmatizer
9
  from transformers import pipeline
10
  from PIL import Image
 
 
11
 
12
  # Download required NLTK data
13
  nltk.download('stopwords')
14
- nltk.download('punkt')
15
  nltk.download('wordnet')
 
16
 
17
  # Load Models
18
  news_classifier = pipeline("text-classification", model="Oneli/News_Classification")
19
- qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
20
 
21
  # Label Mapping
22
  label_mapping = {
@@ -35,11 +36,11 @@ def clean_text(text):
35
  text = text.lower()
36
  text = re.sub(f"[{string.punctuation}]", "", text) # Remove punctuation
37
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # Remove special characters
38
- tokens = word_tokenize(text)
39
- tokens = [word for word in tokens if word not in stopwords.words("english")] # Remove stopwords
40
  lemmatizer = WordNetLemmatizer()
41
- tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatize tokens
42
- return " ".join(tokens)
43
 
44
  # Define the functions
45
  def classify_text(text):
@@ -59,8 +60,7 @@ def classify_csv(file):
59
  text_column = df.columns[0] # Assume first column is the text column
60
 
61
  df[text_column] = df[text_column].astype(str).apply(clean_text) # Clean text column
62
- df["Encoded Prediction"] = df[text_column].apply(lambda x: news_classifier(x)[0]['label'])
63
- df["Decoded Prediction"] = df["Encoded Prediction"].map(label_mapping)
64
  df["Confidence"] = df[text_column].apply(lambda x: round(news_classifier(x)[0]['score'] * 100, 2))
65
 
66
  # Store all text as a single context for QA
@@ -73,37 +73,46 @@ def classify_csv(file):
73
  except Exception as e:
74
  return None, f"Error: {str(e)}"
75
 
76
- def chatbot_response(history, user_input, source):
77
  user_input = user_input.lower()
78
- context = context_storage["context"] if source == "Single Article" else context_storage["bulk_context"]
79
- num_articles = context_storage["num_articles"]
80
 
81
- if "number of articles" in user_input or "how many articles" in user_input:
82
- answer = f"There are {num_articles} articles in the uploaded CSV."
83
- history.append([user_input, answer])
84
- return history, ""
 
 
85
 
86
  if context:
87
- result = qa_pipeline(question=user_input, context=context)
88
- answer = result["answer"]
89
- history.append([user_input, answer])
90
- return history, ""
91
 
92
- responses = {
93
- "hello": "👋 Hello! How can I assist you with news today?",
94
- "hi": "😊 Hi there! What do you want to know about news?",
95
- "how are you": "🤖 I'm just a bot, but I'm here to help!",
96
- "thank you": "🙏 You're welcome! Let me know if you need anything else.",
97
- "news": "📰 I can classify news into Business, Sports, Politics, and more!",
98
- }
99
- response = responses.get(user_input, "🤔 I'm here to help with news classification and general info. Ask me about news topics!")
100
- history.append([user_input, response])
101
- return history, ""
 
 
 
 
 
 
 
 
102
 
103
  # Streamlit App Layout
104
  st.set_page_config(page_title="News Classifier", page_icon="📰")
105
  cover_image = Image.open("cover.png") # Ensure this image exists
106
- st.image(cover_image, caption="News Classifier 📢", use_column_width=True)
107
 
108
  # Section for Single Article Classification
109
  st.subheader("📰 Single Article Classification")
@@ -111,8 +120,12 @@ text_input = st.text_area("Enter News Text", placeholder="Type or paste news con
111
  if st.button("🔍 Classify"):
112
  if text_input:
113
  category, confidence = classify_text(text_input)
114
- st.write(f"*Predicted Category:* {category}")
115
- st.write(f"*Confidence Level:* {confidence}")
 
 
 
 
116
  else:
117
  st.warning("Please enter some text to classify.")
118
 
@@ -129,6 +142,13 @@ if file_input:
129
  file_name=output_file,
130
  mime="text/csv"
131
  )
 
 
 
 
 
 
 
132
  else:
133
  st.error(f"Error processing file: {output_file}")
134
 
@@ -137,9 +157,18 @@ st.subheader("💬 AI Chat Assistant")
137
  history = []
138
  user_input = st.text_input("Ask about news classification or topics", placeholder="Type a message...")
139
  source_toggle = st.radio("Select Context Source", ["Single Article", "Bulk Classification"])
 
140
  if st.button("✉ Send"):
141
- history, bot_response = chatbot_response(history, user_input, source_toggle)
142
- st.write("*Chatbot Response:*")
143
- for q, a in history:
144
- st.write(f"*Q:* {q}")
145
- st.write(f"*A:* {a}")
 
 
 
 
 
 
 
 
 
4
  import string
5
  import nltk
6
  from nltk.corpus import stopwords
 
7
  from nltk.stem import WordNetLemmatizer
8
  from transformers import pipeline
9
  from PIL import Image
10
+ import matplotlib.pyplot as plt
11
+ from wordcloud import WordCloud
12
 
13
  # Download required NLTK data
14
  nltk.download('stopwords')
 
15
  nltk.download('wordnet')
16
+ nltk.download('omw-1.4')
17
 
18
  # Load Models
19
  news_classifier = pipeline("text-classification", model="Oneli/News_Classification")
20
+ qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
21
 
22
  # Label Mapping
23
  label_mapping = {
 
36
  text = text.lower()
37
  text = re.sub(f"[{string.punctuation}]", "", text) # Remove punctuation
38
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # Remove special characters
39
+ words = text.split() # Tokenization without Punkt
40
+ words = [word for word in words if word not in stopwords.words("english")] # Remove stopwords
41
  lemmatizer = WordNetLemmatizer()
42
+ words = [lemmatizer.lemmatize(word) for word in words] # Lemmatize tokens
43
+ return " ".join(words)
44
 
45
  # Define the functions
46
  def classify_text(text):
 
60
  text_column = df.columns[0] # Assume first column is the text column
61
 
62
  df[text_column] = df[text_column].astype(str).apply(clean_text) # Clean text column
63
+ df["Decoded Prediction"] = df[text_column].apply(lambda x: label_mapping.get(news_classifier(x)[0]['label'], "Unknown"))
 
64
  df["Confidence"] = df[text_column].apply(lambda x: round(news_classifier(x)[0]['score'] * 100, 2))
65
 
66
  # Store all text as a single context for QA
 
73
  except Exception as e:
74
  return None, f"Error: {str(e)}"
75
 
76
def chatbot_response(history, user_input, text_input=None, file_input=None):
    """Answer a user question against the selected context and record it.

    Args:
        history: List of ``[question, answer]`` pairs; the new exchange is
            appended to it in place.
        user_input: The question typed by the user. It is lower-cased before
            being sent to the QA pipeline and stored in ``history``.
        text_input: Optional single-article text to use as QA context.
        file_input: Optional uploaded CSV; when given, ``classify_csv`` is
            run on it and ``context_storage["bulk_context"]`` is used as
            QA context.

    Returns:
        A ``(history, answer)`` tuple. ``answer`` is always a string: either
        the QA pipeline's answer or a short explanatory message when the
        question is blank or no context is available.
    """
    question = (user_input or "").lower()

    # Guard: the QA pipeline cannot work with an empty question.
    if not question.strip():
        answer = "Please enter a question."
        history.append([question, answer])
        return history, answer

    context_parts = []
    if text_input:
        context_parts.append(text_input)

    if file_input:
        # Called for its side effect only: classify_csv is expected to
        # refresh context_storage["bulk_context"] (TODO confirm against
        # classify_csv); its return value is not needed here.
        classify_csv(file_input)
        bulk_context = context_storage["bulk_context"] or ""
        if bulk_context:
            context_parts.append(bulk_context)

    # Separate the sources so adjacent words are not glued together.
    context = " ".join(context_parts)

    if context:
        with st.spinner("Finding answer..."):
            result = qa_pipeline(question=question, context=context)
        answer = result["answer"]
    else:
        # Previously `answer` was left unbound on this path, which raised
        # UnboundLocalError at the return statement.
        answer = "No context available. Please provide article text or upload a CSV file first."

    history.append([question, answer])
    return history, answer
94
+
95
+ # Function to generate word cloud from the 'content' column (from output CSV)
96
def generate_word_cloud_from_output(df, column="content"):
    """Build a word cloud from one text column of a DataFrame.

    Args:
        df: DataFrame holding the text to visualise.
        column: Name of the text column to use. If ``df`` has no column
            with this name, the first column is used instead, so CSV files
            whose text column is not literally called "content" no longer
            raise ``KeyError``.

    Returns:
        The object returned by ``WordCloud(...).generate(text)``.
    """
    # Fall back to the first column when the named column is absent.
    text_column = column if column in df.columns else df.columns[0]

    # Drop missing cells and join everything into one whitespace-separated text.
    content_text = " ".join(df[text_column].dropna().astype(str).tolist())

    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(content_text)
    return wordcloud
101
+
102
+ # Function to generate bar graph for decoded predictions
103
def generate_bar_graph(df):
    """Render a bar chart of how often each decoded prediction occurs.

    Args:
        df: DataFrame containing a "Decoded Prediction" column.

    The chart is drawn into the Streamlit page with ``st.pyplot``; nothing
    is returned.
    """
    prediction_counts = df["Decoded Prediction"].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        prediction_counts.plot(kind='bar', ax=ax, color='skyblue')
        ax.set_title('Frequency of Decoded Predictions', fontsize=16)
        ax.set_xlabel('Category', fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # figures pile up each time the script is re-executed.
        plt.close(fig)
111
 
112
  # Streamlit App Layout
113
  st.set_page_config(page_title="News Classifier", page_icon="📰")
114
  cover_image = Image.open("cover.png") # Ensure this image exists
115
+ st.image(cover_image, caption="News Classifier 📢", use_container_width=True)
116
 
117
  # Section for Single Article Classification
118
  st.subheader("📰 Single Article Classification")
 
120
  if st.button("🔍 Classify"):
121
  if text_input:
122
  category, confidence = classify_text(text_input)
123
+ st.write(f"Predicted Category: {category}")
124
+ st.write(f"Confidence Level: {confidence}")
125
+
126
+ # Generate word cloud for the cleaned text input
127
+ wordcloud = generate_word_cloud_from_output(pd.DataFrame({"content": [text_input]})) # Create a DataFrame for single input
128
+ st.image(wordcloud.to_array(), caption="Word Cloud for Text Input", use_container_width=True)
129
  else:
130
  st.warning("Please enter some text to classify.")
131
 
 
142
  file_name=output_file,
143
  mime="text/csv"
144
  )
145
+
146
+ # Generate word cloud for the 'content' column of the processed CSV data
147
+ wordcloud = generate_word_cloud_from_output(df)
148
+ st.image(wordcloud.to_array(), caption="Word Cloud for CSV Content", use_container_width=True)
149
+
150
+ # Generate bar graph for decoded predictions frequency
151
+ generate_bar_graph(df)
152
  else:
153
  st.error(f"Error processing file: {output_file}")
154
 
 
157
  history = []
158
  user_input = st.text_input("Ask about news classification or topics", placeholder="Type a message...")
159
  source_toggle = st.radio("Select Context Source", ["Single Article", "Bulk Classification"])
160
+
161
  if st.button("✉ Send"):
162
+ if not user_input and not file_input:
163
+ st.warning("Please upload your file or provide text input for QA.")
164
+ else:
165
+ history, bot_response = chatbot_response(
166
+ history,
167
+ user_input,
168
+ text_input=text_input if source_toggle == "Single Article" else None,
169
+ file_input=file_input if source_toggle == "Bulk Classification" else None
170
+ )
171
+ st.write("Chatbot Response:")
172
+ for q, a in history:
173
+ st.write(f"Q: {q}")
174
+ st.write(f"A: {a}")