NIXBLACK's picture
Create app.py
2bc2000
raw
history blame
4.58 kB
import streamlit as st
import numpy as np
import pandas as pd
import chardet
import matplotlib.pyplot as plt
from laser_encoders import LaserEncoderPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tqdm import tqdm
# Sniff the CSV's character encoding from its raw bytes, then load the file
# with that encoding and keep only the two columns used further down.
with open('./train.csv', 'rb') as raw_csv:
    result = chardet.detect(raw_csv.read())

detected_encoding = result['encoding']
data = pd.read_csv('./train.csv', encoding=detected_encoding)
data = data.loc[:, ['sentiment', 'text']]
# Build two parallel lists -- integer sentiment codes and their texts --
# dropping every row that cannot be used for training.
SENTIMENT_CODES = {'neutral': 1, 'positive': 2, 'negative': 3}

sentiments = []
texts = []
# Walk the two columns directly instead of DataFrame.iterrows(), which builds
# a Series object for every row.
for index, raw_sentiment, text in zip(data.index, data['sentiment'], data['text']):
    # pandas represents an empty cell as NaN (a float), which has no .lower();
    # skip such rows instead of crashing.
    if not isinstance(raw_sentiment, str):
        print(f"Warning: Skipping row {index} with missing sentiment value")
        continue

    sentiment = raw_sentiment.lower()  # case-insensitive match
    if sentiment not in SENTIMENT_CODES:
        print(f"Warning: Unknown sentiment '{sentiment}' in row {index}")
        continue

    # Same NaN-as-float situation for the text column.
    if isinstance(text, float):
        print(f"Warning: Skipping row {index} with float text value")
        continue

    # Append to both lists only once the row is known to be good, so the two
    # lists can never get out of step.
    sentiments.append(SENTIMENT_CODES[sentiment])
    texts.append(text)
# Re-index the sentiment codes as contiguous 0-based class ids; the fitted
# encoder is kept so predictions can be mapped back later.
label_encoder = LabelEncoder()
label_encoder.fit(sentiments)
encoded_sentiments = label_encoder.transform(sentiments)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    texts,
    encoded_sentiments,
    test_size=0.2,
    random_state=42,
)
# LASER sentence encoder for the language of the training data.
encoder = LaserEncoderPipeline(lang="eng_Latn")


def _embed_one_by_one(sentences):
    """Encode each sentence in its own call and stack the vectors into an array."""
    vectors = [encoder.encode_sentences([sentence])[0] for sentence in tqdm(sentences)]
    return np.array(vectors)


# The train split is embedded before the test split, each with its own
# progress bar.
X_train_embeddings = _embed_one_by_one(X_train)
X_test_embeddings = _embed_one_by_one(X_test)
# Sentiment Prediction with RNN Neural Network and Confusion Matrix
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Reshape, Dropout
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Sentiment classifier: the 1024-wide input is projected to 256 units, handed
# to the recurrent layer as a one-step sequence, and reduced to a 3-way softmax.
model = Sequential([
    Dense(256, input_shape=(1024,), activation='tanh'),
    Reshape((1, 256)),  # one timestep of 256 features for the RNN layer
    SimpleRNN(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.5),  # regularization
    Dense(3, activation='softmax'),
])
# Use a learning rate scheduler
def lr_schedule(epoch):
    """Return the learning rate for *epoch*: 1e-4 shrunk by 10% per epoch."""
    initial_rate = 0.0001
    decay_per_epoch = 0.9
    return initial_rate * decay_per_epoch ** epoch
# Adam starts at the same rate lr_schedule returns for epoch 0; the scheduler
# callback supplies the rate for each subsequent epoch.
lr_scheduler = LearningRateScheduler(lr_schedule)
opt = Adam(learning_rate=0.0001)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)
model.summary()  # print the layer stack as a quick architecture check

# 10% of the training rows are set aside for validation during fitting.
training_options = dict(
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[lr_scheduler],
)
model.fit(X_train_embeddings, y_train, **training_options)
# Held-out performance: the second entry of evaluate()'s result is taken as
# the accuracy.
test_metrics = model.evaluate(X_test_embeddings, y_test)
accuracy = test_metrics[1]

# Most probable class for every test sentence.
y_pred_probabilities = model.predict(X_test_embeddings)
y_pred = y_pred_probabilities.argmax(axis=1)
# --- Streamlit UI: classify a sentence typed by the user ---
# st.slider only yields numbers, so the language code and the sentence must be
# collected with text boxes instead.
language = st.text_input('Enter the language:')
user_text = st.text_input('Enter the text:')

# A text box holds an empty string until the user types something; skip the
# prediction rather than building an encoder for an empty language code.
if language.strip() and user_text.strip():
    # `language` goes straight to LASER, so it is presumably expected in the
    # same form as the "eng_Latn" code used for the training data.
    encoder = LaserEncoderPipeline(lang=language.strip())
    user_text_embedding = encoder.encode_sentences([user_text])[0]
    user_text_embedding = np.reshape(user_text_embedding, (1, -1))  # batch of one

    predicted_sentiment = np.argmax(model.predict(user_text_embedding))
    # Undo the LabelEncoder step to recover the original 1/2/3 sentiment code.
    predicted_sentiment_no = label_encoder.inverse_transform([predicted_sentiment])[0]

    # Any code other than 1 or 2 is reported as negative.
    sentiment_names = {1: 'neutral', 2: 'positive'}
    predicted_sentiment_label = sentiment_names.get(int(predicted_sentiment_no), 'negative')
    st.write("Predicted Sentiment:" + predicted_sentiment_label)