# NOTE: web-page residue from the original listing ("Spaces: Running"); not part of the program.
import streamlit as st | |
import pandas as pd | |
import time | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
from transformers import pipeline, MarianMTModel, MarianTokenizer | |
import matplotlib.pyplot as plt | |
from pymystem3 import Mystem | |
import io | |
from rapidfuzz import fuzz | |
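# Initialize components (VADER, FinBERT, RoBERTa, FinBERT-Tone, Mystem, translation model).
# (Copy the initialization code from your original script.)
# Define helper functions (lemmatize_text, translate, get_vader_sentiment, ...).
# NOTE: process_file() below also calls fuzzy_deduplicate, get_finbert_sentiment,
# get_roberta_sentiment and get_finbert_tone_sentiment, so they must be defined here too.
# (Copy these functions from your original script.)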
def process_file(uploaded_file):
    """Load the 'Публикации' sheet, deduplicate it, translate and score the excerpts.

    Args:
        uploaded_file: Anything ``pandas.read_excel`` accepts; ``main`` passes
            the object returned by ``st.file_uploader``.

    Returns:
        A DataFrame holding exactly these columns, in this order:
        'Объект', 'VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone',
        'Выдержки из текста'.

    Relies on helpers that are not defined in this part of the file:
    ``fuzzy_deduplicate``, ``translate``, ``get_vader_sentiment``,
    ``get_finbert_sentiment``, ``get_roberta_sentiment`` and
    ``get_finbert_tone_sentiment``.
    """
    text_column = 'Выдержки из текста'
    df = pd.read_excel(uploaded_file, sheet_name='Публикации')

    # Apply fuzzy deduplication separately within each 'Объект' group.
    # The groups are iterated explicitly instead of using
    # groupby(...).apply(...): newer pandas versions no longer hand the
    # grouping column to the applied function, which would drop 'Объект'
    # from the result and break the column selection at the end.
    # NOTE(review): assumes fuzzy_deduplicate returns a DataFrame - confirm.
    deduplicated = [
        fuzzy_deduplicate(group, text_column, 65)
        for _, group in df.groupby('Объект')
    ]
    if deduplicated:
        df = pd.concat(deduplicated, ignore_index=True)
    else:
        # No groups at all (empty sheet, or every 'Объект' value missing):
        # keep the columns but no rows, since pd.concat rejects an empty list.
        df = df.iloc[0:0].reset_index(drop=True)

    # Translate texts, reporting progress as a fraction of the rows done.
    total = len(df)
    progress_bar = st.progress(0)
    translated_texts = []
    for done, text in enumerate(df[text_column], start=1):
        # str() lets non-string cells through to translate(); note that a
        # missing (NaN) cell therefore becomes the literal string "nan".
        translated_texts.append(translate(str(text)))
        progress_bar.progress(done / total)

    # Perform sentiment analysis with each model and add the results.
    df['VADER'] = [get_vader_sentiment(text) for text in translated_texts]
    df['FinBERT'] = [get_finbert_sentiment(text) for text in translated_texts]
    df['RoBERTa'] = [get_roberta_sentiment(text) for text in translated_texts]
    df['FinBERT-Tone'] = [
        get_finbert_tone_sentiment(text) for text in translated_texts
    ]

    # Reorder columns (this also discards every other column of the sheet).
    columns_order = [
        'Объект', 'VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone', text_column,
    ]
    return df[columns_order]
def main():
    """Streamlit entry point: upload a workbook, show results, offer a download.

    After a file is uploaded the page shows the first rows of the processed
    data, a 2x2 grid of bar charts (one per sentiment model) and a button
    that downloads the full result as an .xlsx file.
    """
    st.title("Sentiment Analysis App")
    uploaded_file = st.file_uploader("Choose an Excel file", type="xlsx")
    if uploaded_file is None:
        return

    # Streamlit re-runs this whole script on every widget interaction,
    # including a click on the download button below. Keep the processed
    # frame in session state so the expensive pipeline runs once per upload.
    # NOTE(review): the upload is identified by name and size only - confirm
    # that is a good enough key for your use.
    upload_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_key") != upload_key:
        # Store the frame before the key so a failed run is retried next time.
        st.session_state["processed_df"] = process_file(uploaded_file)
        st.session_state["processed_key"] = upload_key
    df = st.session_state["processed_df"]

    st.subheader("Data Preview")
    st.write(df.head())

    st.subheader("Sentiment Distribution")
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle("Sentiment Distribution for Each Model")
    models = ['VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone']
    for ax, model in zip(axs.flat, models):
        sentiment_counts = df[model].value_counts()
        # With no rows there is nothing to draw; leave the axes empty rather
        # than asking pandas to plot an empty series.
        if not sentiment_counts.empty:
            sentiment_counts.plot(kind='bar', ax=ax)
        ax.set_title(f"{model} Sentiment")
        ax.set_xlabel("Sentiment")
        ax.set_ylabel("Count")
    # Lay out this specific figure instead of pyplot's global "current" one.
    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this each
    # rerun would leave another figure behind in the server process.
    plt.close(fig)

    # Offer download of results
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    output.seek(0)
    st.download_button(
        label="Download results as Excel",
        data=output,
        file_name="sentiment_analysis_results.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()