Spaces:
Running
Running
Commit
·
d851af8
1
Parent(s):
cd09d80
all defs in order
Browse files- app.py +66 -4
- hf-streamlit-app.py +0 -89
app.py
CHANGED
@@ -9,12 +9,74 @@ from pymystem3 import Mystem
|
|
9 |
import io
|
10 |
from rapidfuzz import fuzz
|
11 |
|
12 |
-
# Initialize
|
|
|
13 |
|
14 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
# Define helper functions (lemmatize_text, translate, get_vader_sentiment...)
|
17 |
-
# (Copy these functions from your original script)
|
18 |
|
19 |
def process_file(uploaded_file):
|
20 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
|
|
9 |
import io
|
10 |
from rapidfuzz import fuzz
|
11 |
|
12 |
+
# Initialize pymystem3 for lemmatization of Russian text.
mystem = Mystem()

# Set up the sentiment analyzers: rule-based VADER plus three transformer
# pipelines. NOTE(review): each pipeline() call downloads/loads a model at
# import time — presumably acceptable for a Streamlit app; confirm startup cost.
vader_analyzer = SentimentIntensityAnalyzer()
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
|
20 |
+
|
21 |
+
# Function for lemmatizing Russian text
def lemmatize_text(text):
    """Return *text* with each token replaced by its lemma.

    Uses the module-level ``mystem`` analyzer, which yields a list of
    token/whitespace pieces; these are glued back into one string.
    """
    pieces = mystem.lemmatize(text)
    return ''.join(pieces)
|
25 |
+
|
26 |
+
# Translation model for Russian -> English (Helsinki-NLP MarianMT).
# Loaded once at module import; used by translate() below.
model_name = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = MarianTokenizer.from_pretrained(model_name)
translation_model = MarianMTModel.from_pretrained(model_name)
|
30 |
+
|
31 |
+
def translate(text):
    """Translate Russian *text* to English via the module-level MarianMT model.

    Input is truncated to the model's maximum length; only the first (and
    only) generated sequence is decoded and returned.
    """
    encoded = translation_tokenizer(text, return_tensors="pt", truncation=True)
    generated = translation_model.generate(**encoded)
    first_sequence = generated[0]
    return translation_tokenizer.decode(first_sequence, skip_special_tokens=True)
|
35 |
+
|
36 |
+
# Function for VADER sentiment analysis with label mapping
def get_vader_sentiment(text):
    """Classify *text* via VADER's compound score.

    Returns "Positive" above 0.2, "Negative" below -0.2, otherwise "Neutral".
    """
    compound = vader_analyzer.polarity_scores(text)["compound"]
    if compound < -0.2:
        return "Negative"
    if compound > 0.2:
        return "Positive"
    return "Neutral"
|
44 |
+
|
45 |
+
# Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
def get_mapped_sentiment(result):
    """Normalize a HF pipeline result dict to Positive/Negative/Neutral.

    ``result['label']`` varies per model ("LABEL_0"/"LABEL_2" for the
    CardiffNLP RoBERTa, "positive"/"negative" for the FinBERT variants);
    any unrecognized label is treated as "Neutral".
    """
    positive_labels = {"positive", "label_2", "pos", "pos_label"}
    negative_labels = {"negative", "label_0", "neg", "neg_label"}

    label = result['label'].lower()
    if label in positive_labels:
        return "Positive"
    if label in negative_labels:
        return "Negative"
    return "Neutral"
|
53 |
+
|
54 |
+
def get_finbert_sentiment(text):
    """Score *text* with FinBERT and map its label to the common scheme."""
    raw = finbert(text, truncation=True, max_length=512)
    return get_mapped_sentiment(raw[0])
|
57 |
+
|
58 |
+
def get_roberta_sentiment(text):
    """Score *text* with the CardiffNLP RoBERTa model and map its label."""
    raw = roberta(text, truncation=True, max_length=512)
    return get_mapped_sentiment(raw[0])
|
61 |
+
|
62 |
+
def get_finbert_tone_sentiment(text):
    """Score *text* with FinBERT-Tone and map its label to the common scheme."""
    raw = finbert_tone(text, truncation=True, max_length=512)
    return get_mapped_sentiment(raw[0])
|
65 |
+
|
66 |
+
# Fuzzy filter out similar news for the same NER
def fuzzy_deduplicate(df, column, threshold=65):
    """Drop rows whose *column* text is fuzzy-similar to an earlier kept row.

    A row is kept when its text's ``fuzz.ratio`` (0-100) against every
    previously kept text stays below *threshold*. NaN values are always
    kept. Returns a positionally filtered view of *df*.
    """
    kept_texts = []
    kept_positions = []
    for position, value in enumerate(df[column]):
        if pd.isna(value):
            # NaN can't be compared; keep the row untouched.
            kept_positions.append(position)
            continue
        candidate = str(value)
        duplicate = any(
            fuzz.ratio(candidate, previous) >= threshold
            for previous in kept_texts
        )
        if not duplicate:
            kept_texts.append(candidate)
            kept_positions.append(position)
    return df.iloc[kept_positions]
|
79 |
|
|
|
|
|
80 |
|
81 |
def process_file(uploaded_file):
|
82 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
hf-streamlit-app.py
DELETED
@@ -1,89 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import pandas as pd
|
3 |
-
import time
|
4 |
-
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
5 |
-
from transformers import pipeline, MarianMTModel, MarianTokenizer
|
6 |
-
import matplotlib.pyplot as plt
|
7 |
-
from pymystem3 import Mystem
|
8 |
-
import io
|
9 |
-
from rapidfuzz import fuzz
|
10 |
-
|
11 |
-
# Initialize components (VADER, FinBERT, RoBERTa, FinBERT-Tone, Mystem, translation model)
|
12 |
-
# (Copy the initialization code from your original script)
|
13 |
-
|
14 |
-
# Define helper functions (lemmatize_text, translate, get_vader_sentiment, etc.)
|
15 |
-
# (Copy these functions from your original script)
|
16 |
-
|
17 |
-
def process_file(uploaded_file):
    """Load the 'Публикации' sheet, dedupe, translate, and score sentiment.

    Returns a DataFrame with one sentiment column per model, reordered so
    the entity, the four model verdicts, and the source excerpt come first.
    """
    df = pd.read_excel(uploaded_file, sheet_name='Публикации')

    # Remove near-duplicate excerpts within each named-entity group.
    df = df.groupby('Объект').apply(
        lambda group: fuzzy_deduplicate(group, 'Выдержки из текста', 65)
    ).reset_index(drop=True)

    # Translate every excerpt, advancing a Streamlit progress bar.
    progress_bar = st.progress(0)
    translated_texts = []
    total_rows = len(df)
    for row_index, excerpt in enumerate(df['Выдержки из текста']):
        translated_texts.append(translate(str(excerpt)))
        progress_bar.progress((row_index + 1) / total_rows)

    # Run each sentiment model over the English translations.
    df['VADER'] = [get_vader_sentiment(t) for t in translated_texts]
    df['FinBERT'] = [get_finbert_sentiment(t) for t in translated_texts]
    df['RoBERTa'] = [get_roberta_sentiment(t) for t in translated_texts]
    df['FinBERT-Tone'] = [get_finbert_tone_sentiment(t) for t in translated_texts]

    # Present the columns in a fixed, reader-friendly order.
    return df[['Объект', 'VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone',
               'Выдержки из текста']]
|
48 |
-
|
49 |
-
def main():
    """Streamlit entry point: upload an Excel file, analyze, plot, download."""
    st.title("Sentiment Analysis App")

    uploaded_file = st.file_uploader("Choose an Excel file", type="xlsx")
    if uploaded_file is None:
        return  # nothing to do until the user provides a file

    df = process_file(uploaded_file)

    st.subheader("Data Preview")
    st.write(df.head())

    # One bar chart of label counts per sentiment model, on a 2x2 grid.
    st.subheader("Sentiment Distribution")
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle("Sentiment Distribution for Each Model")
    for slot, model in enumerate(['VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone']):
        axis = axs[slot // 2, slot % 2]
        df[model].value_counts().plot(kind='bar', ax=axis)
        axis.set_title(f"{model} Sentiment")
        axis.set_xlabel("Sentiment")
        axis.set_ylabel("Count")
    plt.tight_layout()
    st.pyplot(fig)

    # Serialize the results to an in-memory Excel workbook for download.
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    output.seek(0)
    st.download_button(
        label="Download results as Excel",
        data=output,
        file_name="sentiment_analysis_results.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|