# processor/app.py
import streamlit as st
import pandas as pd
import time
import matplotlib.pyplot as plt
from openpyxl.utils.dataframe import dataframe_to_rows
import io
from rapidfuzz import fuzz
import os
from openpyxl import load_workbook
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
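# Remove near-duplicate rows: a row is kept only if its text is less than
# `threshold`% similar (RapidFuzz ratio) to every previously kept text.
# NaN texts are always kept; cost grows with the number of texts already kept.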
def fuzzy_deduplicate(df, column, threshold=65):
seen_texts = []
indices_to_keep = []
for i, text in enumerate(df[column]):
if pd.isna(text):
indices_to_keep.append(i)
continue
text = str(text)
if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
seen_texts.append(text)
indices_to_keep.append(i)
return df.iloc[indices_to_keep]
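# Build the LLM client. Groq exposes an OpenAI-compatible endpoint, so the
# ChatOpenAI wrapper is pointed at api.groq.com; the API key is expected in the
# Hugging Face Space secrets under 'groq_key'.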
def init_langchain_llm():
try:
if 'groq_key' in st.secrets:
groq_api_key = st.secrets['groq_key']
else:
st.error("Groq API key not found in Hugging Face secrets. Please add it with the key 'groq_key'.")
st.stop()
llm = ChatOpenAI(
base_url="https://api.groq.com/openai/v1",
model="llama-3.1-70b-versatile",
api_key=groq_api_key,
temperature=0.0
)
return llm
except Exception as e:
st.error(f"Error initializing the Groq LLM: {str(e)}")
st.stop()
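# Ask the LLM to translate the Russian news text, classify sentiment and
# estimate the financial impact for the given entity. Returns a
# (sentiment, impact, reasoning) tuple; defaults are kept if the response
# cannot be parsed. Impact categories are mapped back to Russian labels.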
def estimate_sentiment_and_impact(llm, news_text, entity):
template = """
First, translate this Russian text into English:
"{news}"
Then, analyze the translated text about the entity "{entity}" and determine:
1. Sentiment (Positive/Negative/Neutral)
    2. Estimate the potential financial impact in Russian rubles for this entity over the next 6 months.
    If a precise monetary estimate is not possible, categorize the impact as one of the following:
1. "Significant risk of loss"
2. "Moderate risk of loss"
3. "Minor risk of loss"
4. "Probability of profit"
5. "Uncertain effect"
Provide a brief reasoning (maximum 100 words).
Your response should be in the following format:
Translation: [Your English translation]
Sentiment: [Positive/Negative/Neutral]
Impact: [Your estimate or category]
Reasoning: [Your reasoning]
"""
prompt = PromptTemplate(template=template, input_variables=["entity", "news"])
chain = prompt | llm | RunnablePassthrough()
    response = chain.invoke({"entity": entity, "news": news_text})
    # ChatOpenAI returns an AIMessage rather than a plain string, so read .content when available
    response_text = response.content if hasattr(response, "content") else str(response)
    sentiment = "Neutral"
    impact = "Uncertain effect"
    reasoning = "Unable to provide reasoning"
    if response_text:
        try:
            # Extract sentiment
            if "Sentiment:" in response_text:
                sentiment_part = response_text.split("Sentiment:")[1].split("\n")[0].strip().lower()
                if "positive" in sentiment_part:
                    sentiment = "Positive"
                elif "negative" in sentiment_part:
                    sentiment = "Negative"
            # Extract impact and reasoning
            if "Impact:" in response_text and "Reasoning:" in response_text:
                impact_part, reasoning_part = response_text.split("Reasoning:", 1)
                impact = impact_part.split("Impact:")[1].strip()
                reasoning = reasoning_part.strip()
# Translate impact categories back to Russian
impact_mapping = {
"Significant risk of loss": "Значительный риск убытков",
"Moderate risk of loss": "Умеренный риск убытков",
"Minor risk of loss": "Незначительный риск убытков",
"Probability of profit": "Вероятность прибыли",
"Uncertain effect": "Неопределенный эффект"
}
for eng, rus in impact_mapping.items():
if eng.lower() in impact.lower():
impact = rus
break
except Exception as e:
st.error(f"Error parsing LLM response: {str(e)}")
return sentiment, impact, reasoning
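# Format a duration in seconds as a human-readable Russian string,
# e.g. format_elapsed_time(3725) -> "1 час 2 минуты 5 секунд".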
def format_elapsed_time(seconds):
hours, remainder = divmod(int(seconds), 3600)
minutes, seconds = divmod(remainder, 60)
time_parts = []
    if hours > 0:
        time_parts.append(f"{hours} час{'' if hours == 1 else 'а' if 2 <= hours <= 4 else 'ов'}")
    if minutes > 0:
        time_parts.append(f"{minutes} минут{'а' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
    if seconds > 0 or not time_parts:
        time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
return " ".join(time_parts)
def generate_sentiment_visualization(df):
negative_df = df[df['Sentiment'] == 'Negative']
if negative_df.empty:
st.warning("Не обнаружено негативных упоминаний. Отображаем общую статистику по объектам.")
entity_counts = df['Объект'].value_counts()
else:
entity_counts = negative_df['Объект'].value_counts()
if len(entity_counts) == 0:
st.warning("Нет данных для визуализации.")
return None
fig, ax = plt.subplots(figsize=(12, max(6, len(entity_counts) * 0.5)))
entity_counts.plot(kind='barh', ax=ax)
ax.set_title('Количество негативных упоминаний по объектам')
ax.set_xlabel('Количество упоминаний')
plt.tight_layout()
return fig
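# Main processing pipeline: read the 'Публикации' sheet, deduplicate news per
# object, run the LLM on every remaining row, and show per-row results plus a
# progress bar in the Streamlit UI. Returns the enriched DataFrame.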
def process_file(uploaded_file):
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
st.stop()
original_news_count = len(df)
df = df.groupby('Объект').apply(
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
).reset_index(drop=True)
remaining_news_count = len(df)
duplicates_removed = original_news_count - remaining_news_count
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
llm = init_langchain_llm()
if not llm:
st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
st.stop()
df['Sentiment'] = ''
df['Impact'] = ''
df['Reasoning'] = ''
progress_bar = st.progress(0)
status_text = st.empty()
for index, row in df.iterrows():
sentiment, impact, reasoning = estimate_sentiment_and_impact(
llm,
row['Выдержки из текста'],
row['Объект']
)
df.at[index, 'Sentiment'] = sentiment
df.at[index, 'Impact'] = impact
df.at[index, 'Reasoning'] = reasoning
progress = (index + 1) / len(df)
progress_bar.progress(progress)
status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
st.write(f"Объект: {row['Объект']}")
st.write(f"Новость: {row['Заголовок']}")
st.write(f"Тональность: {sentiment}")
st.write(f"Эффект: {impact}")
st.write(f"Обоснование: {reasoning}")
st.write("---")
progress_bar.empty()
status_text.empty()
visualization = generate_sentiment_visualization(df)
if visualization:
st.pyplot(visualization)
return df
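# Collect negative news into a risk table ('РИСК УБЫТКА') with the impact
# estimate, reasoning and source text for each flagged item.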
def create_analysis_data(df):
analysis_data = []
for _, row in df.iterrows():
if row['Sentiment'] == 'Negative':
analysis_data.append([
row['Объект'],
row['Заголовок'],
'РИСК УБЫТКА',
row['Impact'],
row['Reasoning'],
row['Выдержки из текста']
])
return pd.DataFrame(analysis_data, columns=[
'Объект',
'Заголовок',
'Признак',
'Оценка влияния',
'Обоснование',
'Текст сообщения'
])
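# Write the results into the template workbook. This assumes a file named
# sample_file.xlsx with sheets 'Сводка', 'Значимые', 'Анализ' and 'Публикации'
# sits next to the app; the hard-coded start rows and columns match that
# template's layout.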
def create_output_file(df, uploaded_file):
wb = load_workbook("sample_file.xlsx")
    # Align every statistic to the same object order; objects without negative
    # or positive mentions get a zero count instead of NaN after alignment
    objects = df['Объект'].unique()
    negative_counts = df[df['Sentiment'] == 'Negative'].groupby('Объект').size()
    positive_counts = df[df['Sentiment'] == 'Positive'].groupby('Объект').size()
    dominant_impact = df.groupby('Объект')['Impact'].agg(
        lambda x: x.value_counts().index[0] if len(x) > 0 else 'Неопределенный'
    )
    summary_df = pd.DataFrame({
        'Объект': objects,
        'Всего новостей': df.groupby('Объект').size().reindex(objects, fill_value=0).astype(int).values,
        'Негативные': negative_counts.reindex(objects, fill_value=0).astype(int).values,
        'Позитивные': positive_counts.reindex(objects, fill_value=0).astype(int).values,
        'Преобладающий эффект': dominant_impact.reindex(objects, fill_value='Неопределенный').values
    })
summary_df = summary_df.sort_values('Негативные', ascending=False)
ws = wb['Сводка']
for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=True), start=4):
for c_idx, value in enumerate(row, start=5):
ws.cell(row=r_idx, column=c_idx, value=value)
significant_data = []
for _, row in df.iterrows():
if row['Sentiment'] in ['Negative', 'Positive']:
significant_data.append([
row['Объект'],
'релевантен',
row['Sentiment'],
row['Impact'],
row['Заголовок'],
row['Выдержки из текста']
])
ws = wb['Значимые']
for r_idx, row in enumerate(significant_data, start=3):
for c_idx, value in enumerate(row, start=3):
ws.cell(row=r_idx, column=c_idx, value=value)
analysis_df = create_analysis_data(df)
ws = wb['Анализ']
for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=True), start=4):
for c_idx, value in enumerate(row, start=5):
ws.cell(row=r_idx, column=c_idx, value=value)
    uploaded_file.seek(0)  # rewind the uploaded file before reading it a second time
    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
ws = wb['Публикации']
for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
for c_idx, value in enumerate(row, start=1):
ws.cell(row=r_idx, column=c_idx, value=value)
if 'Тех.приложение' not in wb.sheetnames:
wb.create_sheet('Тех.приложение')
ws = wb['Тех.приложение']
for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
for c_idx, value in enumerate(row, start=1):
ws.cell(row=r_idx, column=c_idx, value=value)
output = io.BytesIO()
wb.save(output)
output.seek(0)
return output
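# Streamlit entry point: renders the uploader, runs the pipeline once per
# uploaded file (the result is cached in st.session_state) and offers the
# filled-in workbook for download.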
def main():
st.markdown(
"""
<style>
.signature {
position: fixed;
right: 12px;
bottom: 12px;
font-size: 14px;
color: #FF0000;
opacity: 0.9;
z-index: 999;
}
</style>
<div class="signature">denis.pokrovsky.npff</div>
""",
unsafe_allow_html=True
)
st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС :::")
if 'processed_df' not in st.session_state:
st.session_state.processed_df = None
    uploaded_file = st.file_uploader("Выберите Excel-файл", type="xlsx")
if uploaded_file is not None and st.session_state.processed_df is None:
start_time = time.time()
st.session_state.processed_df = process_file(uploaded_file)
st.subheader("Предпросмотр данных")
preview_df = st.session_state.processed_df[['Объект', 'Заголовок', 'Sentiment', 'Impact']].head()
st.dataframe(preview_df)
analysis_df = create_analysis_data(st.session_state.processed_df)
st.subheader("Анализ")
st.dataframe(analysis_df)
output = create_output_file(st.session_state.processed_df, uploaded_file)
end_time = time.time()
elapsed_time = end_time - start_time
formatted_time = format_elapsed_time(elapsed_time)
st.success(f"Обработка и анализ завершены за {formatted_time}.")
st.download_button(
label="Скачать результат анализа",
data=output,
file_name="результат_анализа.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
if __name__ == "__main__":
main()