import streamlit as st
import numpy as np
import pandas as pd
import chardet
import matplotlib.pyplot as plt
from laser_encoders import LaserEncoderPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tqdm import tqdm
with open('./train.csv', 'rb') as f:
    result = chardet.detect(f.read())

# Use the detected encoding when reading the CSV file
data = pd.read_csv('./train.csv', encoding=result['encoding'])
data = data[['sentiment', 'text']]

sentiments = []
texts = []
for index, row in data.iterrows():
    sentiment = row['sentiment'].lower()  # Convert to lowercase for case-insensitivity
    if sentiment == 'neutral':
        sentiments.append(1)
    elif sentiment == 'positive':
        sentiments.append(2)
    elif sentiment == 'negative':
        sentiments.append(3)
    else:
        # Handle the case where sentiment is not one of the expected values
        # You may choose to skip this row or handle it differently based on your requirements
        print(f"Warning: Unknown sentiment '{sentiment}' in row {index}")
        continue  # Skip the rest of the loop for this row

    text = row['text']
    if not isinstance(text, float):
        texts.append(text)
    else:
        # Skip the sentiment for this row as well
        print(f"Warning: Skipping row {index} with float text value")
        sentiments.pop()  # Remove the last added sentiment
label_encoder = LabelEncoder()
encoded_sentiments = label_encoder.fit_transform(sentiments)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_sentiments, test_size=0.2, random_state=42)
# Initialize the LaserEncoder (the training data is English)
encoder = LaserEncoderPipeline(lang="eng_Latn")

# Initialize empty lists to store embeddings
X_train_embeddings = []
X_test_embeddings = []
for sentence in tqdm(X_train):
    embeddings = encoder.encode_sentences([sentence])[0]
    X_train_embeddings.append(embeddings)

for sentence in tqdm(X_test):
    embeddings = encoder.encode_sentences([sentence])[0]
    X_test_embeddings.append(embeddings)
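# Note: encode_sentences also accepts a full list of sentences, so each split
# could be encoded in a single call (a possible speed-up, assuming memory
# allows holding all sentences at once):
#   X_train_embeddings = encoder.encode_sentences(X_train)
#   X_test_embeddings = encoder.encode_sentences(X_test)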
# Convert lists to numpy arrays
X_train_embeddings = np.array(X_train_embeddings)
X_test_embeddings = np.array(X_test_embeddings)
# Sentiment Prediction with RNN Neural Network and Confusion Matrix
from tensorflow.keras.layers import SimpleRNN, Reshape, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Build a neural network model with RNN
model = Sequential()
model.add(Dense(256, input_shape=(1024,), activation='tanh'))  # LASER embeddings are 1024-dimensional
model.add(Reshape((1, 256)))
model.add(SimpleRNN(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))  # Adding dropout for regularization
model.add(Dense(3, activation='softmax'))  # Three classes: neutral, positive, negative
# Use a learning rate scheduler: the callback overrides the optimizer's
# learning rate at the start of each epoch
def lr_schedule(epoch):
    return 0.0001 * 0.9 ** epoch

opt = Adam(learning_rate=0.0001)
lr_scheduler = LearningRateScheduler(lr_schedule)
# Compile the model
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary to check the architecture
model.summary()
# Train the model with the learning rate scheduler
model.fit(X_train_embeddings, y_train, epochs=30, batch_size=32, validation_split=0.1, callbacks=[lr_scheduler])

# Evaluate the model on the test set
accuracy = model.evaluate(X_test_embeddings, y_test)[1]
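# The held-out accuracy is computed above but never reported; a minimal way to
# surface it in the app (displaying it with st.write is an assumption):
st.write(f"Test accuracy: {accuracy:.4f}")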
# Predictions on the test set
y_pred_probabilities = model.predict(X_test_embeddings)
y_pred = np.argmax(y_pred_probabilities, axis=1)
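# The section header and imports mention a confusion matrix, but none is drawn;
# a minimal sketch using the already-imported confusion_matrix and seaborn.
# Encoded classes 0/1/2 correspond to neutral/positive/negative per the mapping
# above (the heatmap itself is an assumption about the intended output):
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['neutral', 'positive', 'negative'],
            yticklabels=['neutral', 'positive', 'negative'], ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
st.pyplot(fig)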
# Take user input for prediction (LASER expects a language code such as "eng_Latn")
language = st.text_input('Enter the language code (e.g. eng_Latn):', value='eng_Latn')
user_text = st.text_input('Enter the text:')

encoder = LaserEncoderPipeline(lang=language)
user_text_embedding = encoder.encode_sentences([user_text])[0]
user_text_embedding = np.reshape(user_text_embedding, (1, -1))
predicted_sentiment = np.argmax(model.predict(user_text_embedding))
predicted_sentiment_no = label_encoder.inverse_transform([predicted_sentiment])[0]
if predicted_sentiment_no == 1:
    predicted_sentiment_label = 'neutral'
elif predicted_sentiment_no == 2:
    predicted_sentiment_label = 'positive'
else:
    predicted_sentiment_label = 'negative'

st.write("Predicted Sentiment: " + predicted_sentiment_label)