Spaces:
Running
Running
Commit
·
fa80eae
1
Parent(s):
7e40c67
progress spinner attempt and some pandas correction
Browse files
- app.py +6 -7
- requirements.txt +3 -1
app.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import time
|
4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
5 |
-
from transformers import
|
6 |
#from transformers import MarianMTModel, MarianTokenizer
|
7 |
import matplotlib.pyplot as plt
|
8 |
from pymystem3 import Mystem
|
@@ -41,16 +41,13 @@ def translate(text):
|
|
41 |
# Tokenize the input text
|
42 |
inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
|
43 |
|
44 |
-
# Get the number of tokens in the input
|
45 |
-
input_length = inputs.input_ids.shape[1]
|
46 |
-
|
47 |
# Set up a simple spinner
|
48 |
with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
|
49 |
# Generate translation
|
50 |
translated_tokens = translation_model.generate(
|
51 |
**inputs,
|
52 |
num_beams=5,
|
53 |
-
max_length=
|
54 |
no_repeat_ngram_size=2,
|
55 |
early_stopping=True
|
56 |
)
|
@@ -112,9 +109,11 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
112 |
def process_file(uploaded_file):
|
113 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
114 |
|
|
|
|
|
115 |
# Apply fuzzy deduplication
|
116 |
-
df = df.groupby('Объект').apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
|
117 |
-
|
118 |
# Translate texts
|
119 |
translated_texts = []
|
120 |
progress_bar = st.progress(0)
|
|
|
2 |
import pandas as pd
|
3 |
import time
|
4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
5 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
6 |
#from transformers import MarianMTModel, MarianTokenizer
|
7 |
import matplotlib.pyplot as plt
|
8 |
from pymystem3 import Mystem
|
|
|
41 |
# Tokenize the input text
|
42 |
inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
|
43 |
|
|
|
|
|
|
|
44 |
# Set up a simple spinner
|
45 |
with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
|
46 |
# Generate translation
|
47 |
translated_tokens = translation_model.generate(
|
48 |
**inputs,
|
49 |
num_beams=5,
|
50 |
+
max_length=len(text.split()) * 2, # Adjust as needed
|
51 |
no_repeat_ngram_size=2,
|
52 |
early_stopping=True
|
53 |
)
|
|
|
109 |
def process_file(uploaded_file):
|
110 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
111 |
|
112 |
+
|
113 |
+
|
114 |
# Apply fuzzy deduplication
|
115 |
+
df = df.groupby('Объект', group_keys=False).apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
|
116 |
+
|
117 |
# Translate texts
|
118 |
translated_texts = []
|
119 |
progress_bar = st.progress(0)
|
requirements.txt
CHANGED
@@ -3,8 +3,10 @@ pandas
|
|
3 |
vaderSentiment
|
4 |
transformers>=4.30.0
|
5 |
torch
|
|
|
6 |
sentencepiece
|
7 |
pymystem3
|
8 |
openpyxl
|
9 |
rapidfuzz
|
10 |
-
matplotlib
|
|
|
|
3 |
vaderSentiment
|
4 |
transformers>=4.30.0
|
5 |
torch
|
6 |
+
tqdm
|
7 |
sentencepiece
|
8 |
pymystem3
|
9 |
openpyxl
|
10 |
rapidfuzz
|
11 |
+
matplotlib
|
12 |
+
sacremoses
|