pentarosarium committed on
Commit
f6e9269
·
1 Parent(s): 2d683e0

russification and optimization continued

Browse files
Files changed (1) hide show
  1. app.py +35 -3
app.py CHANGED
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
8
  from pymystem3 import Mystem
9
  import io
10
  from rapidfuzz import fuzz
 
 
11
 
12
  # Initialize pymystem3 for lemmatization
13
  mystem = Mystem()
@@ -20,8 +22,12 @@ finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
20
 
21
  # Function for lemmatizing Russian text
22
  def lemmatize_text(text):
23
- lemmatized_text = ''.join(mystem.lemmatize(text))
24
- return lemmatized_text
 
 
 
 
25
 
26
  # Translation model for Russian to English
27
  model_name = "Helsinki-NLP/opus-mt-ru-en"
@@ -31,7 +37,33 @@ translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
31
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
32
 
33
  def translate(text):
34
- return translator(text)[0]['translation_text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Function for VADER sentiment analysis with label mapping
37
  def get_vader_sentiment(text):
 
8
  from pymystem3 import Mystem
9
  import io
10
  from rapidfuzz import fuzz
11
+ from tqdm import tqdm
12
+ import torch
13
 
14
  # Initialize pymystem3 for lemmatization
15
  mystem = Mystem()
 
22
 
23
  # Function for lemmatizing Russian text
24
  def lemmatize_text(text):
25
+ words = text.split()
26
+ lemmatized_words = []
27
+ for word in tqdm(words, desc="Lemmatizing", unit="word"):
28
+ lemmatized_word = ''.join(mystem.lemmatize(word))
29
+ lemmatized_words.append(lemmatized_word)
30
+ return ' '.join(lemmatized_words)
31
 
32
  # Translation model for Russian to English
33
  model_name = "Helsinki-NLP/opus-mt-ru-en"
 
37
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
38
 
39
  def translate(text):
40
+ # Tokenize the input text
41
+ inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
42
+
43
+ # Get the number of tokens in the input
44
+ input_length = inputs.input_ids.shape[1]
45
+
46
+ # Set up the progress bar
47
+ progress_bar = tqdm(total=input_length, desc="Translating", unit="token")
48
+
49
+ # Custom callback to update the progress bar
50
+ def update_progress_bar(beam_idx, token_idx, token):
51
+ progress_bar.update(1)
52
+
53
+ # Generate translation with progress updates
54
+ translated_tokens = translation_model.generate(
55
+ **inputs,
56
+ num_beams=5,
57
+ max_length=input_length + 50, # Adjust as needed
58
+ callback=update_progress_bar
59
+ )
60
+
61
+ # Close the progress bar
62
+ progress_bar.close()
63
+
64
+ # Decode the translated tokens
65
+ translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
66
+ return translated_text
67
 
68
  # Function for VADER sentiment analysis with label mapping
69
  def get_vader_sentiment(text):