pentarosarium committed on
Commit
d851af8
·
1 Parent(s): cd09d80

all defs in order

Browse files
Files changed (2) hide show
  1. app.py +66 -4
  2. hf-streamlit-app.py +0 -89
app.py CHANGED
@@ -9,12 +9,74 @@ from pymystem3 import Mystem
9
  import io
10
  from rapidfuzz import fuzz
11
 
12
- # Initialize components (VADER, FinBERT, RoBERTa, FinBERT-Tone, Mystem, translation model)
 
13
 
14
- # (Copy the initialization code from your original script)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Define helper functions (lemmatize_text, translate, get_vader_sentiment...)
17
- # (Copy these functions from your original script)
18
 
19
  def process_file(uploaded_file):
20
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
 
9
  import io
10
  from rapidfuzz import fuzz
11
 
12
# Initialize pymystem3 for lemmatization of Russian text.
mystem = Mystem()

# Set up the sentiment analyzers once at import time so every helper below
# can share the same instances.
vader_analyzer = SentimentIntensityAnalyzer()
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
20
+
21
+ # Function for lemmatizing Russian text
22
def lemmatize_text(text):
    """Return *text* with every token replaced by its Mystem lemma.

    Args:
        text: Russian text to lemmatize.

    Returns:
        The lemmatized text as a single string. ``None`` and blank strings
        yield ``""`` without invoking Mystem.
    """
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    # NOTE(review): pymystem3's lemmatize() is expected to terminate its token
    # list with "\n"; strip it so callers get a clean string - confirm against
    # the installed pymystem3 version.
    return "".join(mystem.lemmatize(text)).rstrip("\n")
25
+
26
# Translation model for Russian to English; tokenizer and model are loaded
# from the same checkpoint name so they stay in sync.
model_name = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = MarianTokenizer.from_pretrained(model_name)
translation_model = MarianMTModel.from_pretrained(model_name)
30
+
31
def translate(text):
    """Translate *text* with the module-level Marian tokenizer and model.

    Args:
        text: Source text. It is tokenized with ``truncation=True``, so input
            longer than the tokenizer's limit is cut off rather than rejected.

    Returns:
        The decoded first generated sequence with special tokens removed.
        ``None`` and blank strings yield ``""`` without running the model,
        because there is nothing to translate.
    """
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    encoded = translation_tokenizer(text, return_tensors="pt", truncation=True)
    generated = translation_model.generate(**encoded)
    return translation_tokenizer.decode(generated[0], skip_special_tokens=True)
35
+
36
+ # Function for VADER sentiment analysis with label mapping
37
def get_vader_sentiment(text, *, threshold=0.2):
    """Label *text* from VADER's compound polarity score.

    Args:
        text: Text handed to ``vader_analyzer.polarity_scores``.
        threshold: Half-width of the neutral band. Compound scores strictly
            above ``threshold`` are "Positive", strictly below ``-threshold``
            are "Negative". Defaults to the previously hard-coded 0.2.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    compound = vader_analyzer.polarity_scores(text)["compound"]
    if compound > threshold:
        return "Positive"
    if compound < -threshold:
        return "Negative"
    return "Neutral"
44
+
45
+ # Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
46
# Raw classifier labels (lower-cased) that count as positive / negative.
# NOTE(review): "label_2"/"label_0" are assumed to follow the
# cardiffnlp/twitter-roberta-base-sentiment id convention - confirm before
# reusing this mapping with other checkpoints.
_POSITIVE_LABELS = frozenset({"positive", "label_2", "pos", "pos_label"})
_NEGATIVE_LABELS = frozenset({"negative", "label_0", "neg", "neg_label"})


def get_mapped_sentiment(result):
    """Normalize one classifier prediction to a common three-way label.

    Args:
        result: Mapping with a ``'label'`` string, e.g. one element of a
            sentiment pipeline's output list.

    Returns:
        "Positive" or "Negative" when the case-insensitive label is in the
        corresponding set above; "Neutral" for every other label.

    Raises:
        KeyError: If *result* has no ``'label'`` entry.
    """
    label = result['label'].strip().lower()
    if label in _POSITIVE_LABELS:
        return "Positive"
    if label in _NEGATIVE_LABELS:
        return "Negative"
    return "Neutral"
53
+
54
def get_finbert_sentiment(text):
    """Classify *text* with the ``finbert`` pipeline and normalize its label.

    The input is truncated to at most 512 tokens and only the first
    prediction returned by the pipeline is used.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    prediction = finbert(text, truncation=True, max_length=512)[0]
    return get_mapped_sentiment(prediction)
57
+
58
def get_roberta_sentiment(text):
    """Classify *text* with the ``roberta`` pipeline and normalize its label.

    The input is truncated to at most 512 tokens and only the first
    prediction returned by the pipeline is used.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    prediction = roberta(text, truncation=True, max_length=512)[0]
    return get_mapped_sentiment(prediction)
61
+
62
def get_finbert_tone_sentiment(text):
    """Classify *text* with the ``finbert_tone`` pipeline and normalize its label.

    The input is truncated to at most 512 tokens and only the first
    prediction returned by the pipeline is used.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    prediction = finbert_tone(text, truncation=True, max_length=512)[0]
    return get_mapped_sentiment(prediction)
65
+
66
# Fuzzy-filter near-duplicate texts (callers apply this per entity group).
def fuzzy_deduplicate(df, column, threshold=65):
    """Drop rows whose *column* text is too similar to an earlier kept row.

    Rows are visited in order. A row is kept when its text scores below
    *threshold* (``fuzz.ratio``) against every previously kept text. Rows
    with a missing value in *column* are always kept and never used for
    comparison.

    Args:
        df: DataFrame to filter.
        column: Name of the column holding the text to compare.
        threshold: Similarity score at or above which a row counts as a
            duplicate of an earlier one.

    Returns:
        The kept rows, selected by position, in their original order.
    """
    kept_texts = []
    kept_positions = []
    for position, value in enumerate(df[column]):
        if pd.isna(value):
            kept_positions.append(position)
            continue
        candidate = str(value)
        # any() over an empty list is False, so the first text is always kept
        # without a separate emptiness check.
        if any(fuzz.ratio(candidate, kept) >= threshold for kept in kept_texts):
            continue
        kept_texts.append(candidate)
        kept_positions.append(position)
    return df.iloc[kept_positions]
79
 
 
 
80
 
81
  def process_file(uploaded_file):
82
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
hf-streamlit-app.py DELETED
@@ -1,89 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import time
4
- from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
5
- from transformers import pipeline, MarianMTModel, MarianTokenizer
6
- import matplotlib.pyplot as plt
7
- from pymystem3 import Mystem
8
- import io
9
- from rapidfuzz import fuzz
10
-
11
- # Initialize components (VADER, FinBERT, RoBERTa, FinBERT-Tone, Mystem, translation model)
12
- # (Copy the initialization code from your original script)
13
-
14
- # Define helper functions (lemmatize_text, translate, get_vader_sentiment, etc.)
15
- # (Copy these functions from your original script)
16
-
17
def process_file(uploaded_file):
    """Deduplicate, translate and score the excerpts of an uploaded workbook.

    Reads the 'Публикации' sheet, removes near-duplicate excerpts within each
    'Объект' group, translates every remaining excerpt and labels each
    translation with the four sentiment helpers. A Streamlit progress bar is
    advanced while translating.

    Args:
        uploaded_file: Passed straight to ``pd.read_excel``.

    Returns:
        DataFrame limited to the entity column, the four model columns and
        the excerpt column, in that order.
    """
    text_column = 'Выдержки из текста'
    df = pd.read_excel(uploaded_file, sheet_name='Публикации')

    # Apply fuzzy deduplication separately inside every entity group.
    df = (
        df.groupby('Объект')
        .apply(lambda group: fuzzy_deduplicate(group, text_column, 65))
        .reset_index(drop=True)
    )

    # Translate texts one at a time so the progress bar can be advanced.
    total = len(df)
    progress_bar = st.progress(0)
    translated_texts = []
    for done, text in enumerate(df[text_column], start=1):
        # NOTE(review): str() turns a missing excerpt into the literal "nan",
        # which is then translated and scored like real text.
        translated_texts.append(translate(str(text)))
        progress_bar.progress(done / total)

    # Perform sentiment analysis and add the results to the DataFrame.
    df['VADER'] = [get_vader_sentiment(text) for text in translated_texts]
    df['FinBERT'] = [get_finbert_sentiment(text) for text in translated_texts]
    df['RoBERTa'] = [get_roberta_sentiment(text) for text in translated_texts]
    df['FinBERT-Tone'] = [get_finbert_tone_sentiment(text) for text in translated_texts]

    # Reorder columns; anything else in the sheet is dropped.
    columns_order = ['Объект', 'VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone', text_column]
    return df[columns_order]
48
-
49
def main():
    """Streamlit entry point: upload a workbook, show results, offer a download.

    Nothing beyond the title and the uploader is rendered until a file has
    been uploaded.
    """
    st.title("Sentiment Analysis App")

    uploaded_file = st.file_uploader("Choose an Excel file", type="xlsx")
    if uploaded_file is None:
        return

    df = process_file(uploaded_file)

    st.subheader("Data Preview")
    st.write(df.head())

    st.subheader("Sentiment Distribution")
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle("Sentiment Distribution for Each Model")

    models = ['VADER', 'FinBERT', 'RoBERTa', 'FinBERT-Tone']
    # axs.flat walks the 2x2 grid row by row, one subplot per model.
    for ax, model in zip(axs.flat, models):
        df[model].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f"{model} Sentiment")
        ax.set_xlabel("Sentiment")
        ax.set_ylabel("Count")

    fig.tight_layout()
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps every figure it created.
    plt.close(fig)

    # Offer download of results
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    output.seek(0)
    st.download_button(
        label="Download results as Excel",
        data=output,
        file_name="sentiment_analysis_results.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
87
-
88
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()