pentarosarium committed on
Commit
051d547
·
1 Parent(s): e6534df
Files changed (1) hide show
  1. app.py +118 -116
app.py CHANGED
@@ -645,7 +645,7 @@ def create_interface():
645
  control = ProcessControl()
646
 
647
  with gr.Blocks(theme=gr.themes.Soft()) as app:
648
- gr.Markdown("# AI-анализ мониторинга новостей v.1.32")
649
 
650
  with gr.Row():
651
  file_input = gr.File(
@@ -704,128 +704,130 @@ def create_interface():
704
  control.request_stop()
705
  return "Остановка обработки..."
706
 
707
- @spaces.GPU(duration=300)
708
- def analyze(file_bytes):
709
- if file_bytes is None:
710
- gr.Warning("Пожалуйста, загрузите файл")
711
- return None, None, None, False, None, "Ожидание файла...", ""
712
-
713
- try:
714
- # Reset stop flag
715
- control.reset()
716
-
717
- file_obj = io.BytesIO(file_bytes)
718
- logger.info("File loaded into BytesIO successfully")
719
-
720
- detector = EventDetector()
721
-
722
- # Read and deduplicate data
723
- df = pd.read_excel(file_obj, sheet_name='Публикации')
724
- original_count = len(df)
725
- df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
726
- removed_count = original_count - len(df)
727
- dedup_message = f"Удалено {removed_count} дубликатов из {original_count} записей"
728
- logger.info(f"Removed {removed_count} duplicate entries")
729
-
730
- processed_rows = []
731
- total = len(df)
732
- batch_size = 3
733
-
734
- for batch_start in range(0, total, batch_size):
735
- if control.should_stop():
736
- # Create partial results if stopped
737
- if processed_rows:
738
- result_df = pd.DataFrame(processed_rows)
739
- output = create_output_file(result_df, file_obj)
740
- if output:
741
- fig_sentiment, fig_events = create_visualizations(result_df)
742
- return (
743
- result_df,
744
- fig_sentiment,
745
- fig_events,
746
- True, # Show download button
747
- output, # Raw bytes
748
- f"Обработка остановлена. Обработано {len(processed_rows)}/{total} строк",
749
- dedup_message
750
- )
751
- break
752
-
753
- batch_end = min(batch_start + batch_size, total)
754
- batch = df.iloc[batch_start:batch_end]
755
-
756
- for idx, row in batch.iterrows():
757
- try:
758
- text = str(row.get('Выдержки из текста', '')).strip()
759
- entity = str(row.get('Объект', '')).strip()
760
-
761
- if not text or not entity:
762
- continue
763
-
764
- # Process with GPU
765
- results = detector.process_text(text, entity)
766
-
767
- processed_rows.append({
768
- 'Объект': entity,
769
- 'Заголовок': str(row.get('Заголовок', '')),
770
- 'Translated': results['translated_text'],
771
- 'Sentiment': results['sentiment'],
772
- 'Impact': results['impact'],
773
- 'Reasoning': results['reasoning'],
774
- 'Event_Type': results['event_type'],
775
- 'Event_Summary': results['event_summary'],
776
- 'Выдержки из текста': text[:1000]
777
- })
778
-
779
- except Exception as e:
780
- logger.error(f"Error processing row {idx}: {str(e)}")
781
- continue
782
-
783
- # Create intermediate results
784
- if processed_rows:
785
- result_df = pd.DataFrame(processed_rows)
786
- output = create_output_file(result_df, file_obj)
787
- if output:
788
- fig_sentiment, fig_events = create_visualizations(result_df)
789
- yield (
790
- result_df,
791
- fig_sentiment,
792
- fig_events,
793
- True, # Show download button
794
- output, # Raw bytes
795
- f"Обработано {len(processed_rows)}/{total} строк",
796
- dedup_message
797
- )
798
-
799
- # Cleanup GPU resources after batch
800
- torch.cuda.empty_cache()
801
- time.sleep(2)
802
-
803
- # Create final results
804
  if processed_rows:
805
- final_df = pd.DataFrame(processed_rows)
806
- output = create_output_file(final_df, file_obj)
807
- if output:
808
- fig_sentiment, fig_events = create_visualizations(final_df)
 
809
  return (
810
- final_df,
811
  fig_sentiment,
812
  fig_events,
813
- True, # Show download button
814
- output, # Raw bytes
815
- "Обработка завершена!",
816
  dedup_message
817
  )
818
- else:
819
- return None, None, None, False, None, "Нет обработанных данных", ""
 
 
 
 
 
 
 
820
 
821
- except Exception as e:
822
- error_msg = f"Ошибка анализа: {str(e)}"
823
- logger.error(error_msg)
824
- gr.Error(error_msg)
825
- return None, None, None, False, None, error_msg, ""
826
- finally:
827
- if detector:
828
- detector.cleanup()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
 
830
  def trigger_download(show_button, file_content):
831
  """Handle download button visibility and file content"""
 
645
  control = ProcessControl()
646
 
647
  with gr.Blocks(theme=gr.themes.Soft()) as app:
648
+ gr.Markdown("# AI-анализ мониторинга новостей v.1.33")
649
 
650
  with gr.Row():
651
  file_input = gr.File(
 
704
  control.request_stop()
705
  return "Остановка обработки..."
706
 
707
+ @spaces.GPU(duration=300)
708
+ def analyze(file_bytes):
709
+ if file_bytes is None:
710
+ gr.Warning("Пожалуйста, загрузите файл")
711
+ return None, None, None, False, None, "Ожидание файла...", ""
712
+
713
+ try:
714
+ # Reset stop flag
715
+ control.reset()
716
+
717
+ file_obj = io.BytesIO(file_bytes)
718
+ logger.info("File loaded into BytesIO successfully")
719
+
720
+ detector = EventDetector()
721
+
722
+ # Read and deduplicate data
723
+ df = pd.read_excel(file_obj, sheet_name='Публикации')
724
+ original_count = len(df)
725
+ df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
726
+ removed_count = original_count - len(df)
727
+ dedup_message = f"Удалено {removed_count} дубликатов из {original_count} записей"
728
+ logger.info(f"Removed {removed_count} duplicate entries")
729
+
730
+ processed_rows = []
731
+ total = len(df)
732
+ batch_size = 3
733
+
734
+ for batch_start in range(0, total, batch_size):
735
+ if control.should_stop():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
  if processed_rows:
737
+ result_df = pd.DataFrame(processed_rows)
738
+ output_bytes_io = create_output_file(result_df, file_obj)
739
+ if output_bytes_io:
740
+ output_bytes = output_bytes_io.getvalue() # Convert BytesIO to bytes
741
+ fig_sentiment, fig_events = create_visualizations(result_df)
742
  return (
743
+ result_df,
744
  fig_sentiment,
745
  fig_events,
746
+ True,
747
+ output_bytes, # Return bytes instead of BytesIO
748
+ f"Обработка остановлена. Обработано {len(processed_rows)}/{total} строк",
749
  dedup_message
750
  )
751
+ break
752
+
753
+ batch_end = min(batch_start + batch_size, total)
754
+ batch = df.iloc[batch_start:batch_end]
755
+
756
+ for idx, row in batch.iterrows():
757
+ try:
758
+ text = str(row.get('Выдержки из текста', '')).strip()
759
+ entity = str(row.get('Объект', '')).strip()
760
 
761
+ if not text or not entity:
762
+ continue
763
+
764
+ # Process with GPU
765
+ results = detector.process_text(text, entity)
766
+
767
+ processed_rows.append({
768
+ 'Объект': entity,
769
+ 'Заголовок': str(row.get('Заголовок', '')),
770
+ 'Translated': results['translated_text'],
771
+ 'Sentiment': results['sentiment'],
772
+ 'Impact': results['impact'],
773
+ 'Reasoning': results['reasoning'],
774
+ 'Event_Type': results['event_type'],
775
+ 'Event_Summary': results['event_summary'],
776
+ 'Выдержки из текста': text[:1000]
777
+ })
778
+
779
+ except Exception as e:
780
+ logger.error(f"Error processing row {idx}: {str(e)}")
781
+ continue
782
+
783
+ # Create intermediate results
784
+ if processed_rows:
785
+ result_df = pd.DataFrame(processed_rows)
786
+ output_bytes_io = create_output_file(result_df, file_obj)
787
+ if output_bytes_io:
788
+ output_bytes = output_bytes_io.getvalue() # Convert BytesIO to bytes
789
+ fig_sentiment, fig_events = create_visualizations(result_df)
790
+ yield (
791
+ result_df,
792
+ fig_sentiment,
793
+ fig_events,
794
+ True,
795
+ output_bytes, # Return bytes instead of BytesIO
796
+ f"Обработано {len(processed_rows)}/{total} строк",
797
+ dedup_message
798
+ )
799
+
800
+ # Cleanup GPU resources after batch
801
+ torch.cuda.empty_cache()
802
+ time.sleep(2)
803
+
804
+ # Create final results
805
+ if processed_rows:
806
+ final_df = pd.DataFrame(processed_rows)
807
+ output_bytes_io = create_output_file(final_df, file_obj)
808
+ if output_bytes_io:
809
+ output_bytes = output_bytes_io.getvalue() # Convert BytesIO to bytes
810
+ fig_sentiment, fig_events = create_visualizations(final_df)
811
+ return (
812
+ final_df,
813
+ fig_sentiment,
814
+ fig_events,
815
+ True,
816
+ output_bytes, # Return bytes instead of BytesIO
817
+ "Обработка завершена!",
818
+ dedup_message
819
+ )
820
+ else:
821
+ return None, None, None, False, None, "Нет обработанных данных", ""
822
+
823
+ except Exception as e:
824
+ error_msg = f"Ошибка анализа: {str(e)}"
825
+ logger.error(error_msg)
826
+ gr.Error(error_msg)
827
+ return None, None, None, False, None, error_msg, ""
828
+ finally:
829
+ if detector:
830
+ detector.cleanup()
831
 
832
  def trigger_download(show_button, file_content):
833
  """Handle download button visibility and file content"""