Spaces:
Running
Running
Commit
·
4673e91
1
Parent(s):
bc1927c
progress more 31+
Browse files- app.py +74 -10
- sample_file.xlsx +0 -0
app.py
CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
|
|
3 |
import time
|
4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
6 |
-
#from transformers import MarianMTModel, MarianTokenizer
|
7 |
import matplotlib.pyplot as plt
|
8 |
from pymystem3 import Mystem
|
9 |
import io
|
@@ -11,6 +10,7 @@ from rapidfuzz import fuzz
|
|
11 |
from tqdm.auto import tqdm
|
12 |
import time
|
13 |
import torch
|
|
|
14 |
|
15 |
# Initialize pymystem3 for lemmatization
|
16 |
mystem = Mystem()
|
@@ -107,6 +107,7 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
107 |
|
108 |
|
109 |
def process_file(uploaded_file):
|
|
|
110 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
111 |
|
112 |
original_news_count = len(df)
|
@@ -162,8 +163,75 @@ def process_file(uploaded_file):
|
|
162 |
|
163 |
return df
|
164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
def main():
|
166 |
-
st.title("... приступим к анализу... версия
|
167 |
|
168 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
169 |
|
@@ -177,7 +245,7 @@ def main():
|
|
177 |
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
|
178 |
fig.suptitle("Распределение окраски по моделям")
|
179 |
|
180 |
-
models = ['
|
181 |
for i, model in enumerate(models):
|
182 |
ax = axs[i // 2, i % 2]
|
183 |
sentiment_counts = df[model].value_counts()
|
@@ -190,16 +258,12 @@ def main():
|
|
190 |
st.pyplot(fig)
|
191 |
|
192 |
# Offer download of results
|
193 |
-
output =
|
194 |
-
with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
195 |
-
df.to_excel(writer, index=False)
|
196 |
-
output.seek(0)
|
197 |
st.download_button(
|
198 |
-
label="
|
199 |
data=output,
|
200 |
-
file_name="
|
201 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
202 |
)
|
203 |
-
|
204 |
if __name__ == "__main__":
|
205 |
main()
|
|
|
3 |
import time
|
4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
|
|
6 |
import matplotlib.pyplot as plt
|
7 |
from pymystem3 import Mystem
|
8 |
import io
|
|
|
10 |
from tqdm.auto import tqdm
|
11 |
import time
|
12 |
import torch
|
13 |
+
from openpyxl import load_workbook
|
14 |
|
15 |
# Initialize pymystem3 for lemmatization
|
16 |
mystem = Mystem()
|
|
|
107 |
|
108 |
|
109 |
def process_file(uploaded_file):
|
110 |
+
|
111 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
112 |
|
113 |
original_news_count = len(df)
|
|
|
163 |
|
164 |
return df
|
165 |
|
166 |
+
def create_output_file(df):
    """Build the downloadable Excel report from the analysed news frame.

    The report starts as a copy of the ``sample_file.xlsx`` template and the
    computed tables are written over the template's sheets, so whatever
    layout the template carries is kept around the written cells.

    Args:
        df: DataFrame holding at least the columns 'Объект', 'Заголовок',
            'Выдержки из текста' and the model verdict columns 'FinBERT',
            'RoBERTa' and 'FinBERT-Tone' (compared against the strings
            'Negative' / 'Positive').

    Returns:
        io.BytesIO positioned at offset 0 and containing the .xlsx workbook.

    Raises:
        FileNotFoundError: if ``sample_file.xlsx`` is not in the working
            directory.
        KeyError: if one of the required columns is missing from ``df``.

    Note:
        The previous implementation assigned worksheets with
        ``writer.book[name] = sheet`` (openpyxl workbooks do not support item
        assignment) and finished with ``writer.save()`` (removed in
        pandas 2.0). Append mode with ``if_sheet_exists='overlay'`` is the
        supported way to write into existing sheets; it needs pandas >= 1.4.
        Every sheet of the template ends up in the output, not only the ones
        written below.
    """
    models = ['FinBERT', 'RoBERTa', 'FinBERT-Tone']
    text_columns = ['Объект', 'Заголовок', 'Выдержки из текста']

    # A row counts as negative/positive when at least one model says so.
    is_negative = df[models].eq('Negative').any(axis=1)
    is_positive = df[models].eq('Positive').any(axis=1)

    # 'Сводка': per-entity totals, most negative entities first.
    summary_rows = []
    for entity in df['Объект'].unique():
        of_entity = df['Объект'] == entity
        summary_rows.append([
            entity,
            int(of_entity.sum()),
            int((of_entity & is_negative).sum()),
            int((of_entity & is_positive).sum()),
        ])
    summary_df = pd.DataFrame(
        summary_rows,
        columns=['Объект', 'Всего новостей', 'Отрицательные', 'Положительные'],
    ).sort_values('Отрицательные', ascending=False)

    # 'Значимые': every row with a non-neutral verdict; a negative verdict
    # from any model wins over a positive one.
    is_significant = is_negative | is_positive
    significant_df = df.loc[is_significant, text_columns].rename(
        columns={'Выдержки из текста': 'Текст'}
    )
    significant_df.insert(
        1,
        'Окраска',
        ['Negative' if negative else 'Positive'
         for negative in is_negative[is_significant]],
    )

    # 'Анализ': negative rows only, all tagged with the same risk type.
    analysis_df = df.loc[is_negative, text_columns].rename(
        columns={'Выдержки из текста': 'Текст'}
    )
    analysis_df.insert(1, 'Тип риска', 'РИСК УБЫТКА')

    # Seed the in-memory buffer with the template so that append mode opens
    # the template workbook itself.
    with open("sample_file.xlsx", "rb") as template:
        output = io.BytesIO(template.read())

    # The context manager saves and closes the workbook even if a write
    # fails. 'overlay' writes into an existing sheet without clearing it;
    # sheets missing from the template are created.
    with pd.ExcelWriter(
        output,
        engine='openpyxl',
        mode='a',
        if_sheet_exists='overlay',
    ) as writer:
        summary_df.to_excel(writer, sheet_name='Сводка', startrow=3, startcol=4, index=False, header=False)
        significant_df.to_excel(writer, sheet_name='Значимые', startrow=2, startcol=2, index=False)
        analysis_df.to_excel(writer, sheet_name='Анализ', startrow=3, startcol=4, index=False)
        df.to_excel(writer, sheet_name='Публикации', index=False)
        df.to_excel(writer, sheet_name='Тех.приложение', index=False)

    output.seek(0)
    return output
|
231 |
+
|
232 |
+
|
233 |
def main():
|
234 |
+
st.title("... приступим к анализу... версия 31+")
|
235 |
|
236 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
237 |
|
|
|
245 |
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
|
246 |
fig.suptitle("Распределение окраски по моделям")
|
247 |
|
248 |
+
models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
|
249 |
for i, model in enumerate(models):
|
250 |
ax = axs[i // 2, i % 2]
|
251 |
sentiment_counts = df[model].value_counts()
|
|
|
258 |
st.pyplot(fig)
|
259 |
|
260 |
# Offer download of results
|
261 |
+
output = create_output_file(df)
|
|
|
|
|
|
|
262 |
st.download_button(
|
263 |
+
label="Скачать результат анализа новостей",
|
264 |
data=output,
|
265 |
+
file_name="результат_анализа_новостей.xlsx",
|
266 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
267 |
)
|
|
|
268 |
if __name__ == "__main__":
|
269 |
main()
|
sample_file.xlsx
ADDED
Binary file (134 kB). View file
|
|