pentarosarium committed on
Commit
6fe0751
·
1 Parent(s): 186bbf3
Files changed (1) hide show
  1. app.py +55 -225
app.py CHANGED
@@ -977,185 +977,110 @@ def process_file(uploaded_file, model_choice, translation_method=None):
977
  'Заголовок': '',
978
  'Выдержки из текста': '',
979
  'Translated': '',
980
- 'Sentiment': 'Neutral', # Default sentiment
981
- 'Impact': 'Неопределенный эффект', # Default impact
982
- 'Reasoning': 'Не проанализировано', # Default reasoning
983
- 'Event_Type': 'Нет', # Default event type
984
- 'Event_Summary': '' # Default event summary
985
  }
986
 
987
  # Ensure all required columns exist in DataFrame
988
  for col, default_value in required_columns.items():
989
  if col not in df.columns:
990
  df[col] = default_value
991
-
992
- # Copy all columns to processed_rows_df
993
- processed_rows_df = pd.DataFrame(columns=list(required_columns.keys()))
994
- #processed_rows_df = pd.DataFrame(columns=df.columns)
995
 
996
- # Deduplication
997
- original_count = len(df)
998
- df = df.groupby('Объект', group_keys=False).apply(
999
- lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
1000
- ).reset_index(drop=True)
1001
- st.write(f"Из {original_count} сообщений удалено {original_count - len(df)} дубликатов.")
1002
 
1003
  # Process rows
1004
  total_rows = len(df)
1005
  processed_rows = 0
1006
 
1007
  for idx, row in df.iterrows():
1008
- # Check for stop/pause
1009
- # In process_file function, replace the stop handling section:
1010
  if st.session_state.control.is_stopped():
1011
  st.warning("Обработку остановили")
1012
  if not processed_rows_df.empty:
1013
  try:
1014
- # Ensure all required columns have values
1015
- for col, default_value in required_columns.items():
1016
- if col not in processed_rows_df.columns:
1017
- processed_rows_df[col] = default_value
1018
- else:
1019
- # Fill NaN values with defaults
1020
- processed_rows_df[col] = processed_rows_df[col].fillna(default_value)
 
 
 
 
 
 
1021
 
1022
- # Copy original file columns that might be needed
1023
- original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
1024
- for col in original_df.columns:
1025
- if col not in processed_rows_df.columns:
1026
- processed_rows_df[col] = ''
1027
-
1028
- # Create output file
1029
- output = create_output_file(processed_rows_df, uploaded_file, llm)
1030
  if output is not None:
1031
  st.download_button(
1032
- label=f"📊 Скачать результат ({len(processed_rows_df)} из {len(df)} строк)",
1033
  data=output,
1034
  file_name="partial_analysis.xlsx",
1035
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1036
  key="partial_download"
1037
  )
1038
- else:
1039
- st.error("Не удалось создать файл с частичными результатами")
1040
  except Exception as e:
1041
- st.error(f"Ошибка при создании файла с частичными результатами: {str(e)}\n{str(type(e))}")
1042
- st.error(f"Processed rows: {len(processed_rows_df)}")
1043
 
1044
  return processed_rows_df
1045
-
1046
 
1047
  st.session_state.control.wait_if_paused()
1048
  if st.session_state.control.is_paused():
1049
- st.info("Обработка на паузе. Можно возобновить.")
1050
- if not processed_rows_df.empty: # Only offer download if we have processed rows
1051
- output = create_output_file(processed_rows_df, uploaded_file, llm)
1052
- if output is not None:
1053
- st.download_button(
1054
- label=f"📊 Скачать результат ({processed_rows} из {total_rows} строк)",
1055
- data=output,
1056
- file_name="partial_analysis.xlsx",
1057
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1058
- key="partial_download"
1059
- )
1060
- break
1061
  continue
1062
 
1063
  try:
 
 
 
1064
  # Translation
1065
  translated_text = translator.translate_text(row['Выдержки из текста'])
1066
- df.at[idx, 'Translated'] = translated_text
1067
 
1068
  # Sentiment analysis
1069
  sentiment = analyze_sentiment(translated_text)
1070
- df.at[idx, 'Sentiment'] = sentiment
1071
 
1072
- # Event detection using BERT/ MT-5
1073
  event_type, event_summary = event_detector.detect_event_type(
1074
  row['Выдержки из текста'],
1075
  row['Объект']
1076
  )
1077
- df.at[idx, 'Event_Type'] = event_type
1078
- df.at[idx, 'Event_Summary'] = event_summary
1079
-
1080
-
1081
- # Show events in real-time
1082
- #if event_type != "Нет":
1083
- # ui.show_event(
1084
- # row['Объект'],
1085
- # event_type,
1086
- # row['Заголовок']
1087
- # )
1088
-
1089
- #Calculate processing speed (items per second)
1090
- current_time = time.time()
1091
-
1092
- time_delta = current_time - last_update_time
1093
- if time_delta > 0:
1094
- processing_speed = 1 / time_delta # items per second
1095
- else:
1096
- processing_speed = 0
1097
-
1098
- # Update live statistics
1099
- ui.update_stats(row, sentiment, event_type, processing_speed)
1100
-
1101
-
1102
- # Handle negative sentiment
1103
 
1104
  # Handle negative sentiment
1105
  if sentiment == "Negative":
1106
  try:
1107
- # Validate translated text
1108
  if translated_text and len(translated_text.strip()) > 0:
1109
- # Initialize Groq LLM if not already done
1110
- if 'groq_llm' not in locals():
1111
- groq_llm = ensure_groq_llm()
1112
-
1113
  impact, reasoning = estimate_impact(
1114
  groq_llm if groq_llm is not None else llm,
1115
  translated_text,
1116
  row['Объект']
1117
  )
1118
- else:
1119
- # Use original text if translation failed
1120
- original_text = row['Выдержки из текста']
1121
- if original_text and len(original_text.strip()) > 0:
1122
- impact, reasoning = estimate_impact(
1123
- groq_llm if groq_llm is not None else llm,
1124
- original_text,
1125
- row['Объект']
1126
- )
1127
- else:
1128
- impact = "Неопределенный эффект"
1129
- reasoning = "Текст новости отсутствует"
1130
- st.warning(f"Empty news text for {row['Объект']}")
1131
-
1132
  except Exception as e:
1133
- impact = "Неопределенный эффект"
1134
- reasoning = "Error in impact estimation"
1135
- st.warning(f"Impact estimation error: {str(e)}")
1136
-
1137
- # Store results
1138
- df.at[idx, 'Impact'] = impact
1139
- df.at[idx, 'Reasoning'] = reasoning
1140
-
1141
-
1142
- row_data = {col: row.get(col, default_val) for col, default_val in required_columns.items()}
1143
- processed_rows_df = pd.concat([processed_rows_df, pd.DataFrame([row_data])], ignore_index=True)
1144
- #processed_rows_df = pd.concat([processed_rows_df, df.iloc[[idx]]], ignore_index=True)
1145
-
1146
  # Update progress
1147
  processed_rows += 1
1148
  ui.update_progress(processed_rows, total_rows)
1149
- last_update_time = current_time
1150
-
1151
  except Exception as e:
1152
  st.warning(f"Ошибка в обработке ряда {idx + 1}: {str(e)}")
1153
  continue
1154
-
1155
- time.sleep(0.1)
1156
-
1157
 
1158
- return processed_rows_df if st.session_state.control.is_stopped() else df
1159
 
1160
  except Exception as e:
1161
  st.error(f"Ошибка в обработке файла: {str(e)}")
@@ -1481,129 +1406,34 @@ def translate_reasoning_to_russian(llm, text):
1481
  else:
1482
  return str(response).strip()
1483
 
 
1484
  def create_output_file(df, uploaded_file, llm):
 
1485
  try:
1486
  wb = load_workbook("sample_file.xlsx")
1487
 
1488
- # Update 'Мониторинг' sheet with events
1489
- ws = wb['Мониторинг']
1490
- row_idx = 4
1491
- events_df = df[df['Event_Type'] != 'Нет'].copy()
1492
- for _, row in events_df.iterrows():
1493
- ws.cell(row=row_idx, column=5, value=row['Объект'])
1494
- ws.cell(row=row_idx, column=6, value=row['Заголовок'])
1495
- ws.cell(row=row_idx, column=7, value=row['Event_Type'])
1496
- ws.cell(row=row_idx, column=8, value=row['Event_Summary'])
1497
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
1498
- row_idx += 1
1499
-
1500
- # Calculate statistics safely
1501
- try:
1502
- entity_stats = pd.DataFrame({
1503
- 'Объект': df['Объект'].unique(),
1504
- 'Всего': df.groupby('Объект').size(),
1505
- 'Негативные': df[df['Sentiment'] == 'Negative'].groupby('Объект').size().fillna(0).astype(int),
1506
- 'Позитивные': df[df['Sentiment'] == 'Positive'].groupby('Объект').size().fillna(0).astype(int)
1507
- }).sort_values('Негативные', ascending=False)
1508
- except Exception as e:
1509
- st.warning(f"Error calculating entity stats: {str(e)}")
1510
- entity_stats = pd.DataFrame(columns=['Объект', 'Всего', 'Негативные', 'Позитивные'])
1511
 
1512
- # Calculate impacts safely
1513
- entity_impacts = {}
1514
- for entity in df['Объект'].unique():
1515
- try:
1516
- entity_df = df[df['Объект'] == entity]
1517
- negative_df = entity_df[entity_df['Sentiment'] == 'Negative']
1518
- if len(negative_df) > 0 and 'Impact' in negative_df.columns:
1519
- impacts = negative_df['Impact'].dropna()
1520
- entity_impacts[entity] = impacts.iloc[0] if len(impacts) > 0 else 'Неопределенный эффект'
1521
- else:
1522
- entity_impacts[entity] = 'Неопределенный эффект'
1523
- except Exception as e:
1524
- st.warning(f"Error calculating impact for {entity}: {str(e)}")
1525
- entity_impacts[entity] = 'Неопределенный эффект'
1526
-
1527
- # Update 'Сводка' sheet
1528
- ws = wb['Сводка']
1529
- for idx, (entity, row) in enumerate(entity_stats.iterrows(), start=4):
1530
- ws.cell(row=idx, column=5, value=entity)
1531
- ws.cell(row=idx, column=6, value=row['Всего'])
1532
- ws.cell(row=idx, column=7, value=row['Негативные'])
1533
- ws.cell(row=idx, column=8, value=row['Позитивные'])
1534
- ws.cell(row=idx, column=9, value=entity_impacts.get(entity, 'Неопределенный эффект'))
1535
-
1536
- # Update 'Значимые' sheet with both negative and positive
1537
- ws = wb['Значимые']
1538
- row_idx = 3
1539
- sentiment_df = df[df['Sentiment'].isin(['Negative', 'Positive'])].copy()
1540
- for _, row in sentiment_df.iterrows():
1541
- cols = ['Объект', 'Заголовок', 'Sentiment', 'Impact', 'Выдержки из текста']
1542
- for col in cols:
1543
- if col not in row:
1544
- row[col] = '' # Handle missing columns
1545
-
1546
- ws.cell(row=row_idx, column=3, value=row['Объект'])
1547
- ws.cell(row=row_idx, column=4, value='релевантно')
1548
- ws.cell(row=row_idx, column=5, value=row['Sentiment'])
1549
- ws.cell(row=row_idx, column=6, value=row.get('Impact', ''))
1550
- ws.cell(row=row_idx, column=7, value=row['Заголовок'])
1551
- ws.cell(row=row_idx, column=8, value=row['Выдержки из текста'])
1552
- row_idx += 1
1553
-
1554
- # Copy processed rows to 'Публикации' sheet
1555
- ws = wb['Публикации']
1556
- for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
1557
- for c_idx, value in enumerate(row, start=1):
1558
- ws.cell(row=r_idx, column=c_idx, value=value)
1559
-
1560
- # Update 'Анализ' sheet safely
1561
- ws = wb['Анализ']
1562
- row_idx = 4
1563
- negative_df = df[df['Sentiment'] == 'Negative'].copy()
1564
- for _, row in negative_df.iterrows():
1565
- ws.cell(row=row_idx, column=5, value=row['Объект'])
1566
- ws.cell(row=row_idx, column=6, value=row['Заголовок'])
1567
- ws.cell(row=row_idx, column=7, value="Риск убытка")
1568
-
1569
- reasoning = row.get('Reasoning', '')
1570
- if reasoning and pd.notna(reasoning):
1571
- try:
1572
- grlm = init_langchain_llm("Groq (llama-3.1-70b)")
1573
- translated_reasoning = translate_reasoning_to_russian(grlm, reasoning)
1574
- ws.cell(row=row_idx, column=8, value=translated_reasoning)
1575
- except Exception as e:
1576
- ws.cell(row=row_idx, column=8, value=reasoning)
1577
-
1578
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
1579
- row_idx += 1
1580
-
1581
- # Update 'Тех.приложение' sheet
1582
- tech_cols = ['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']
1583
- tech_df = df[[col for col in tech_cols if col in df.columns]].copy()
1584
-
1585
- if 'Тех.приложение' not in wb.sheetnames:
1586
- wb.create_sheet('Тех.приложение')
1587
- ws = wb['Тех.приложение']
1588
-
1589
- for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
1590
- for c_idx, value in enumerate(row, start=1):
1591
- ws.cell(row=r_idx, column=c_idx, value=value)
1592
-
1593
  output = io.BytesIO()
1594
  wb.save(output)
1595
  output.seek(0)
1596
  return output
1597
-
1598
  except Exception as e:
1599
- st.error(f"Error creating output file: {str(e)}")
1600
  return None
1601
 
 
1602
  def main():
1603
  st.set_page_config(layout="wide")
1604
 
1605
  with st.sidebar:
1606
- st.title("::: AI-анализ мониторинга новостей (v.4.8):::")
1607
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
1608
 
1609
  model_choice = st.radio(
@@ -1635,7 +1465,7 @@ def main():
1635
  .signature {
1636
  position: fixed;
1637
  right: 12px;
1638
- up: 12px;
1639
  font-size: 14px;
1640
  color: #FF0000;
1641
  opacity: 0.9;
 
977
  'Заголовок': '',
978
  'Выдержки из текста': '',
979
  'Translated': '',
980
+ 'Sentiment': 'Neutral',
981
+ 'Impact': 'Неопределенный эффект',
982
+ 'Reasoning': 'Не проанализировано',
983
+ 'Event_Type': 'Нет',
984
+ 'Event_Summary': ''
985
  }
986
 
987
  # Ensure all required columns exist in DataFrame
988
  for col, default_value in required_columns.items():
989
  if col not in df.columns:
990
  df[col] = default_value
 
 
 
 
991
 
992
+ # Create processed_rows_df with all columns from original df and required columns
993
+ all_columns = list(set(list(df.columns) + list(required_columns.keys())))
994
+ processed_rows_df = pd.DataFrame(columns=all_columns)
 
 
 
995
 
996
  # Process rows
997
  total_rows = len(df)
998
  processed_rows = 0
999
 
1000
  for idx, row in df.iterrows():
 
 
1001
  if st.session_state.control.is_stopped():
1002
  st.warning("Обработку остановили")
1003
  if not processed_rows_df.empty:
1004
  try:
1005
+ # Create the output files for each sheet
1006
+ monitoring_df = processed_rows_df[processed_rows_df['Event_Type'] != 'Нет'].copy()
1007
+ svodka_df = processed_rows_df.groupby('Объект').agg({
1008
+ 'Объект': 'first',
1009
+ 'Sentiment': lambda x: sum(x == 'Negative'),
1010
+ 'Event_Type': lambda x: sum(x != 'Нет')
1011
+ }).reset_index()
1012
+
1013
+ # Prepare final DataFrame for file creation
1014
+ result_df = pd.DataFrame()
1015
+ result_df['Мониторинг'] = monitoring_df.to_dict('records')
1016
+ result_df['Сводка'] = svodka_df.to_dict('records')
1017
+ result_df['Публикации'] = processed_rows_df.to_dict('records')
1018
 
1019
+ output = create_output_file(result_df, uploaded_file, llm)
 
 
 
 
 
 
 
1020
  if output is not None:
1021
  st.download_button(
1022
+ label=f"📊 Скачать результат ({processed_rows} из {total_rows} строк)",
1023
  data=output,
1024
  file_name="partial_analysis.xlsx",
1025
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1026
  key="partial_download"
1027
  )
 
 
1028
  except Exception as e:
1029
+ st.error(f"Ошибка при создании файла: {str(e)}")
 
1030
 
1031
  return processed_rows_df
 
1032
 
1033
  st.session_state.control.wait_if_paused()
1034
  if st.session_state.control.is_paused():
 
 
 
 
 
 
 
 
 
 
 
 
1035
  continue
1036
 
1037
  try:
1038
+ # Copy original row data
1039
+ new_row = row.copy()
1040
+
1041
  # Translation
1042
  translated_text = translator.translate_text(row['Выдержки из текста'])
1043
+ new_row['Translated'] = translated_text
1044
 
1045
  # Sentiment analysis
1046
  sentiment = analyze_sentiment(translated_text)
1047
+ new_row['Sentiment'] = sentiment
1048
 
1049
+ # Event detection
1050
  event_type, event_summary = event_detector.detect_event_type(
1051
  row['Выдержки из текста'],
1052
  row['Объект']
1053
  )
1054
+ new_row['Event_Type'] = event_type
1055
+ new_row['Event_Summary'] = event_summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1056
 
1057
  # Handle negative sentiment
1058
  if sentiment == "Negative":
1059
  try:
 
1060
  if translated_text and len(translated_text.strip()) > 0:
 
 
 
 
1061
  impact, reasoning = estimate_impact(
1062
  groq_llm if groq_llm is not None else llm,
1063
  translated_text,
1064
  row['Объект']
1065
  )
1066
+ new_row['Impact'] = impact
1067
+ new_row['Reasoning'] = reasoning
 
 
 
 
 
 
 
 
 
 
 
 
1068
  except Exception as e:
1069
+ new_row['Impact'] = "Неопределенный эффект"
1070
+ new_row['Reasoning'] = "Ошибка анализа"
1071
+
1072
+ # Add processed row to DataFrame
1073
+ processed_rows_df = pd.concat([processed_rows_df, pd.DataFrame([new_row])], ignore_index=True)
1074
+
 
 
 
 
 
 
 
1075
  # Update progress
1076
  processed_rows += 1
1077
  ui.update_progress(processed_rows, total_rows)
1078
+
 
1079
  except Exception as e:
1080
  st.warning(f"Ошибка в обработке ряда {idx + 1}: {str(e)}")
1081
  continue
 
 
 
1082
 
1083
+ return processed_rows_df
1084
 
1085
  except Exception as e:
1086
  st.error(f"Ошибка в обработке файла: {str(e)}")
 
1406
  else:
1407
  return str(response).strip()
1408
 
1409
+
1410
  def create_output_file(df, uploaded_file, llm):
1411
+ """Simple function to write prepared DataFrame to Excel file"""
1412
  try:
1413
  wb = load_workbook("sample_file.xlsx")
1414
 
1415
+ # Copy all sheets from processed DataFrame
1416
+ for sheet_name in wb.sheetnames:
1417
+ ws = wb[sheet_name]
1418
+ if sheet_name == 'Публикации':
1419
+ for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
1420
+ for c_idx, value in enumerate(row, start=1):
1421
+ ws.cell(row=r_idx, column=c_idx, value=value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1423
  output = io.BytesIO()
1424
  wb.save(output)
1425
  output.seek(0)
1426
  return output
 
1427
  except Exception as e:
1428
+ st.error(f"Error saving file: {str(e)}")
1429
  return None
1430
 
1431
+
1432
  def main():
1433
  st.set_page_config(layout="wide")
1434
 
1435
  with st.sidebar:
1436
+ st.title("::: AI-анализ мониторинга новостей (v.4.9):::")
1437
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
1438
 
1439
  model_choice = st.radio(
 
1465
  .signature {
1466
  position: fixed;
1467
  right: 12px;
1468
+ down: 12px;
1469
  font-size: 14px;
1470
  color: #FF0000;
1471
  opacity: 0.9;