pentarosarium committed on
Commit
fa80eae
·
1 Parent(s): 7e40c67

progress spinner attempt and some pandas correction

Browse files
Files changed (2) hide show
  1. app.py +6 -7
  2. requirements.txt +3 -1
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import pandas as pd
3
  import time
4
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
5
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
  #from transformers import MarianMTModel, MarianTokenizer
7
  import matplotlib.pyplot as plt
8
  from pymystem3 import Mystem
@@ -41,16 +41,13 @@ def translate(text):
41
  # Tokenize the input text
42
  inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
43
 
44
- # Get the number of tokens in the input
45
- input_length = inputs.input_ids.shape[1]
46
-
47
  # Set up a simple spinner
48
  with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
49
  # Generate translation
50
  translated_tokens = translation_model.generate(
51
  **inputs,
52
  num_beams=5,
53
- max_length=input_length * 2, # Adjust as needed
54
  no_repeat_ngram_size=2,
55
  early_stopping=True
56
  )
@@ -112,9 +109,11 @@ def fuzzy_deduplicate(df, column, threshold=65):
112
  def process_file(uploaded_file):
113
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
114
 
 
 
115
  # Apply fuzzy deduplication
116
- df = df.groupby('Объект').apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
117
-
118
  # Translate texts
119
  translated_texts = []
120
  progress_bar = st.progress(0)
 
2
  import pandas as pd
3
  import time
4
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
  #from transformers import MarianMTModel, MarianTokenizer
7
  import matplotlib.pyplot as plt
8
  from pymystem3 import Mystem
 
41
  # Tokenize the input text
42
  inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
43
 
 
 
 
44
  # Set up a simple spinner
45
  with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
46
  # Generate translation
47
  translated_tokens = translation_model.generate(
48
  **inputs,
49
  num_beams=5,
50
+ max_length=len(text.split()) * 2, # Adjust as needed
51
  no_repeat_ngram_size=2,
52
  early_stopping=True
53
  )
 
109
  def process_file(uploaded_file):
110
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
111
 
112
+
113
+
114
  # Apply fuzzy deduplication
115
+ df = df.groupby('Объект', group_keys=False).apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
116
+
117
  # Translate texts
118
  translated_texts = []
119
  progress_bar = st.progress(0)
requirements.txt CHANGED
@@ -3,8 +3,10 @@ pandas
3
  vaderSentiment
4
  transformers>=4.30.0
5
  torch
 
6
  sentencepiece
7
  pymystem3
8
  openpyxl
9
  rapidfuzz
10
- matplotlib
 
 
3
  vaderSentiment
4
  transformers>=4.30.0
5
  torch
6
+ tqdm
7
  sentencepiece
8
  pymystem3
9
  openpyxl
10
  rapidfuzz
11
+ matplotlib
12
+ sacremoses