Spaces:
Running
Running
Commit
·
0b49979
1
Parent(s):
8c530ad
progress more 36+
Browse files
app.py
CHANGED
@@ -32,7 +32,6 @@ def create_analysis_data(df):
|
|
32 |
analysis_data.append([row['Объект'], 'РИСК УБЫТКА', row['Заголовок'], row['Выдержки из текста']])
|
33 |
return pd.DataFrame(analysis_data, columns=['Объект', 'Тип риска', 'Заголовок', 'Текст'])
|
34 |
|
35 |
-
|
36 |
# Function for lemmatizing Russian text
|
37 |
def lemmatize_text(text):
|
38 |
words = text.split()
|
@@ -116,9 +115,14 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
116 |
|
117 |
|
118 |
def process_file(uploaded_file):
|
119 |
-
|
120 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
original_news_count = len(df)
|
123 |
|
124 |
# Apply fuzzy deduplication
|
@@ -126,13 +130,11 @@ def process_file(uploaded_file):
|
|
126 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
127 |
).reset_index(drop=True)
|
128 |
|
129 |
-
|
130 |
remaining_news_count = len(df)
|
131 |
duplicates_removed = original_news_count - remaining_news_count
|
132 |
|
133 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
134 |
|
135 |
-
|
136 |
# Translate texts
|
137 |
translated_texts = []
|
138 |
lemmatized_texts = []
|
@@ -152,25 +154,24 @@ def process_file(uploaded_file):
|
|
152 |
progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
|
153 |
|
154 |
# Perform sentiment analysis
|
155 |
-
#rubert1_results = [get_rubert1_sentiment(text) for text in texts]
|
156 |
rubert2_results = [get_rubert2_sentiment(text) for text in texts]
|
157 |
finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
|
158 |
roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
|
159 |
finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
|
160 |
|
161 |
-
#
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
|
173 |
-
return
|
174 |
|
175 |
def create_output_file(df, uploaded_file, analysis_df):
|
176 |
# Create a new workbook
|
@@ -238,7 +239,7 @@ def create_output_file(df, uploaded_file, analysis_df):
|
|
238 |
return output
|
239 |
|
240 |
def main():
|
241 |
-
st.title("... приступим к анализу... версия
|
242 |
|
243 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
244 |
|
|
|
32 |
analysis_data.append([row['Объект'], 'РИСК УБЫТКА', row['Заголовок'], row['Выдержки из текста']])
|
33 |
return pd.DataFrame(analysis_data, columns=['Объект', 'Тип риска', 'Заголовок', 'Текст'])
|
34 |
|
|
|
35 |
# Function for lemmatizing Russian text
|
36 |
def lemmatize_text(text):
|
37 |
words = text.split()
|
|
|
115 |
|
116 |
|
117 |
def process_file(uploaded_file):
|
|
|
118 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
119 |
|
120 |
+
required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
|
121 |
+
missing_columns = [col for col in required_columns if col not in df.columns]
|
122 |
+
if missing_columns:
|
123 |
+
st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
|
124 |
+
st.stop()
|
125 |
+
|
126 |
original_news_count = len(df)
|
127 |
|
128 |
# Apply fuzzy deduplication
|
|
|
130 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
131 |
).reset_index(drop=True)
|
132 |
|
|
|
133 |
remaining_news_count = len(df)
|
134 |
duplicates_removed = original_news_count - remaining_news_count
|
135 |
|
136 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
137 |
|
|
|
138 |
# Translate texts
|
139 |
translated_texts = []
|
140 |
lemmatized_texts = []
|
|
|
154 |
progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
|
155 |
|
156 |
# Perform sentiment analysis
|
|
|
157 |
rubert2_results = [get_rubert2_sentiment(text) for text in texts]
|
158 |
finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
|
159 |
roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
|
160 |
finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
|
161 |
|
162 |
+
# Create a new DataFrame with processed data
|
163 |
+
processed_df = pd.DataFrame({
|
164 |
+
'Объект': df['Объект'],
|
165 |
+
'Заголовок': df['Заголовок'], # Preserve original 'Заголовок'
|
166 |
+
'ruBERT2': rubert2_results,
|
167 |
+
'FinBERT': finbert_results,
|
168 |
+
'RoBERTa': roberta_results,
|
169 |
+
'FinBERT-Tone': finbert_tone_results,
|
170 |
+
'Выдержки из текста': df['Выдержки из текста'],
|
171 |
+
'Translated': translated_texts
|
172 |
+
})
|
173 |
|
174 |
+
return processed_df
|
175 |
|
176 |
def create_output_file(df, uploaded_file, analysis_df):
|
177 |
# Create a new workbook
|
|
|
239 |
return output
|
240 |
|
241 |
def main():
|
242 |
+
st.title("... приступим к анализу... версия 36+")
|
243 |
|
244 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
245 |
|