# File size: 1,588 Bytes | dc3c28f
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
import pickle
# Suppress every warning for the rest of the run.
warnings.filterwarnings('ignore')

# Load dataset
# The CSV is read without a header row and its first column becomes the index.
# Only the columns labelled 2 and 3 are kept and renamed to "sentiments" and
# "text"; the index is replaced by a fresh 0..n-1 range before rows containing
# a missing value are dropped.
df = (
    pd.read_csv('./twitter_sentiment.csv', header=None, index_col=0)[[2, 3]]
    .reset_index(drop=True)
    .rename(columns={2: "sentiments", 3: "text"})
    .dropna()
)
# Preprocess text
# Patterns are compiled once at import time; process_text runs once per row.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@[a-zA-Z0-9_]+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def process_text(text) -> str:
    """Normalize a raw tweet into lowercase, letters-and-whitespace-only text.

    Steps, in order: lowercase, strip ``http...`` URLs, strip ``@mentions``,
    then remove every character that is not an ASCII letter or whitespace.
    The last step also removes ``#``, so a hashtag keeps its word. Whitespace
    is not collapsed, so removed tokens can leave consecutive spaces behind.

    Args:
        text: The raw tweet. Non-string values are accepted: ``None`` and
            float NaN (a missing value) yield ``""``; anything else is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        # `text != text` is true only for NaN; a missing value has no tokens.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # URLs and mentions must go before the letter filter, which would
    # otherwise leave their alphabetic fragments behind as fake words.
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    return _NON_LETTER_RE.sub('', text)
df['clean_text'] = df['text'].apply(process_text)

# Split data
# The split happens BEFORE vectorizing. Fitting CountVectorizer on the full
# corpus would let held-out tweets influence which 5000 terms form the
# vocabulary, leaking test information into training and into the reported
# accuracy. The intermediate full-corpus `count_matrix` is therefore no
# longer built.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['sentiments'], test_size=0.2, random_state=42
)

# Vectorize text
# The vocabulary is learned from the training rows only and then applied
# unchanged to the test rows.
count_vectorizer = CountVectorizer(max_features=5000)
X_train = count_vectorizer.fit_transform(text_train)
X_test = count_vectorizer.transform(text_test)

# Train model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Save the fitted vectorizer and classifier. New text must be passed through
# this same vectorizer so its columns match what the classifier was trained on.
with open('count_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)
with open('nb_classifier.pkl', 'wb') as classifier_file:
    pickle.dump(nb_classifier, classifier_file)