pentarosarium committed on
Commit
4673e91
·
1 Parent(s): bc1927c

progress more 31+

Browse files
Files changed (2) hide show
  1. app.py +74 -10
  2. sample_file.xlsx +0 -0
app.py CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
3
  import time
4
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
- #from transformers import MarianMTModel, MarianTokenizer
7
  import matplotlib.pyplot as plt
8
  from pymystem3 import Mystem
9
  import io
@@ -11,6 +10,7 @@ from rapidfuzz import fuzz
11
  from tqdm.auto import tqdm
12
  import time
13
  import torch
 
14
 
15
  # Initialize pymystem3 for lemmatization
16
  mystem = Mystem()
@@ -107,6 +107,7 @@ def fuzzy_deduplicate(df, column, threshold=65):
107
 
108
 
109
  def process_file(uploaded_file):
 
110
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
111
 
112
  original_news_count = len(df)
@@ -162,8 +163,75 @@ def process_file(uploaded_file):
162
 
163
  return df
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def main():
166
- st.title("... приступим к анализу... версия 30")
167
 
168
  uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
169
 
@@ -177,7 +245,7 @@ def main():
177
  fig, axs = plt.subplots(2, 2, figsize=(12, 8))
178
  fig.suptitle("Распределение окраски по моделям")
179
 
180
- models = ['ruBERT1', 'ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
181
  for i, model in enumerate(models):
182
  ax = axs[i // 2, i % 2]
183
  sentiment_counts = df[model].value_counts()
@@ -190,16 +258,12 @@ def main():
190
  st.pyplot(fig)
191
 
192
  # Offer download of results
193
- output = io.BytesIO()
194
- with pd.ExcelWriter(output, engine='openpyxl') as writer:
195
- df.to_excel(writer, index=False)
196
- output.seek(0)
197
  st.download_button(
198
- label="Хотите загрузить результат? Вот он",
199
  data=output,
200
- file_name="sentiment_analysis_results.xlsx",
201
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
202
  )
203
-
204
  if __name__ == "__main__":
205
  main()
 
3
  import time
4
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
6
  import matplotlib.pyplot as plt
7
  from pymystem3 import Mystem
8
  import io
 
10
  from tqdm.auto import tqdm
11
  import time
12
  import torch
13
+ from openpyxl import load_workbook
14
 
15
  # Initialize pymystem3 for lemmatization
16
  mystem = Mystem()
 
107
 
108
 
109
  def process_file(uploaded_file):
110
+
111
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
112
 
113
  original_news_count = len(df)
 
163
 
164
  return df
165
 
166
def _news_rows(df, mask, extra_column, extra_values):
    """Return entity/headline/excerpt of the rows selected by *mask*, plus one extra column.

    The extra column is inserted right after 'Объект'; the excerpt column is
    renamed to 'Текст' to match the report layout.
    """
    rows = df.loc[mask, ['Объект', 'Заголовок', 'Выдержки из текста']]
    rows = rows.rename(columns={'Выдержки из текста': 'Текст'})
    rows.insert(1, extra_column, extra_values)
    return rows


def create_output_file(df):
    """Build the Excel report from the analysed news and return it as a buffer.

    The workbook is based on the ``sample_file.xlsx`` template: sheets that
    exist in the template are written over in place (cells outside the written
    range are left as they are), sheets that do not exist are created.

    Args:
        df: DataFrame with at least the columns 'Объект', 'Заголовок',
            'Выдержки из текста' and the per-model sentiment columns
            'FinBERT', 'RoBERTa', 'FinBERT-Tone'.

    Returns:
        io.BytesIO positioned at offset 0 and containing the .xlsx workbook.

    Raises:
        FileNotFoundError: if ``sample_file.xlsx`` is not in the working directory.
        KeyError: if a required column is missing from *df*.
    """
    sentiment_models = ['FinBERT', 'RoBERTa', 'FinBERT-Tone']

    # A news item counts as negative (positive) when at least one model says so.
    # Compute both flags once instead of re-checking every row several times.
    verdicts = df[sentiment_models]
    has_negative = verdicts.eq('Negative').any(axis=1)
    has_positive = verdicts.eq('Positive').any(axis=1)

    # 'Сводка': per-entity totals, most negative entities first.
    # Positional arrays are used so a non-unique index in df cannot break alignment.
    # Rows without an entity are left out of the summary.
    counts = pd.DataFrame({
        'Объект': df['Объект'].to_numpy(),
        'Всего новостей': 1,
        'Отрицательные': has_negative.to_numpy().astype(int),
        'Положительные': has_positive.to_numpy().astype(int),
    })
    summary_df = (
        counts.groupby('Объект', sort=False, as_index=False)
        .sum()
        .sort_values('Отрицательные', ascending=False, kind='stable')
    )

    # 'Значимые': every item with a non-neutral verdict; negative wins over positive.
    significant = has_negative | has_positive
    tone = has_negative[significant].map({True: 'Negative', False: 'Positive'})
    significant_df = _news_rows(df, significant, 'Окраска', tone.to_numpy())

    # 'Анализ': negative items only, all tagged with the same risk type.
    analysis_df = _news_rows(df, has_negative, 'Тип риска', 'РИСК УБЫТКА')

    # Start from the template bytes so its sheets and formatting are kept.
    with open("sample_file.xlsx", "rb") as template:
        output = io.BytesIO(template.read())

    # Append mode loads the existing workbook from the buffer; 'overlay' writes
    # into existing sheets instead of failing or creating renamed copies.
    # The context manager saves and closes the writer (ExcelWriter.save() is
    # gone in pandas 2.x) and also cleans up if one of the writes raises.
    with pd.ExcelWriter(output, engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        summary_df.to_excel(writer, sheet_name='Сводка', startrow=3, startcol=4,
                            index=False, header=False)
        significant_df.to_excel(writer, sheet_name='Значимые', startrow=2, startcol=2,
                                index=False)
        analysis_df.to_excel(writer, sheet_name='Анализ', startrow=3, startcol=4,
                             index=False)
        df.to_excel(writer, sheet_name='Публикации', index=False)
        # Full technical dump of the analysed data.
        df.to_excel(writer, sheet_name='Тех.приложение', index=False)

    output.seek(0)
    return output
231
+
232
+
233
  def main():
234
+ st.title("... приступим к анализу... версия 31+")
235
 
236
  uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
237
 
 
245
  fig, axs = plt.subplots(2, 2, figsize=(12, 8))
246
  fig.suptitle("Распределение окраски по моделям")
247
 
248
+ models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
249
  for i, model in enumerate(models):
250
  ax = axs[i // 2, i % 2]
251
  sentiment_counts = df[model].value_counts()
 
258
  st.pyplot(fig)
259
 
260
  # Offer download of results
261
+ output = create_output_file(df)
 
 
 
262
  st.download_button(
263
+ label="Скачать результат анализа новостей",
264
  data=output,
265
+ file_name="результат_анализа_новостей.xlsx",
266
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
267
  )
 
268
  if __name__ == "__main__":
269
  main()
sample_file.xlsx ADDED
Binary file (134 kB). View file