Spaces:

Nerva5678
/

Excel-QA-bot

Sleeping

App Files Files Community

Excel-QA-bot / app.py

Nerva5678

Upload 2 files

2e4c503 verified 28 days ago

raw

history blame

10.7 kB

	import streamlit as st
	import pandas as pd
	import torch
	import os
	import time
	import logging
	from langchain.embeddings.huggingface import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.llms import HuggingFacePipeline
	from langchain.chains import RetrievalQA, LLMChain
	from langchain.prompts import PromptTemplate
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	# Logging: INFO level via basicConfig, plus a logger named after this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Page configuration. The browser-tab title and the on-page heading share
	# the same text, so it is defined once.
	_APP_TITLE = "Excel 問答 AI（ChatGLM 驅動）"
	st.set_page_config(page_title=_APP_TITLE, page_icon="🤖", layout="wide")

	# App heading followed by the usage instructions.
	st.title("🤖 " + _APP_TITLE)
	_USAGE_MARKDOWN = """
	### 使用說明
	1. 可直接提問一般知識，AI 將使用內建能力回答
	2. 上傳 Excel 檔案（包含「問題」和「答案」欄位）以添加專業知識
	3. 系統會優先使用您上傳的知識庫進行回答
	"""
	st.markdown(_USAGE_MARKDOWN)

	# Sidebar settings. The option lists are named up front so the widget calls
	# below stay short; the first entry of each list is the default selection.
	_LLM_CHOICES = ["THUDM/chatglm3-6b", "THUDM/chatglm2-6b", "THUDM/chatglm-6b"]
	_EMBEDDING_CHOICES = [
	    "shibing624/text2vec-base-chinese",
	    "GanymedeNil/text2vec-large-chinese",
	]
	_ANSWER_MODES = ["混合模式（優先使用上傳資料）", "僅使用上傳資料", "僅使用模型知識"]

	with st.sidebar:
	    st.header("參數設定")

	    # Model selection
	    model_option = st.selectbox("選擇模型", _LLM_CHOICES, index=0)
	    embedding_option = st.selectbox("選擇嵌入模型", _EMBEDDING_CHOICES, index=0)

	    # Answering mode; the chosen label is compared verbatim further down.
	    mode = st.radio("回答模式", _ANSWER_MODES)

	    # Generation and retrieval parameters
	    max_tokens = st.slider("最大回應長度", min_value=128, max_value=2048, value=512)
	    temperature = st.slider(
	        "溫度（創造性）", min_value=0.0, max_value=1.0, value=0.7, step=0.1
	    )
	    top_k = st.slider("檢索相關文檔數", min_value=1, max_value=5, value=3)

	    # "About" section
	    for about_line in (
	        "---",
	        "### 關於",
	        "此應用使用 ChatGLM 模型結合 LangChain 框架，將您的 Excel 數據轉化為智能問答系統。同時支持一般知識問答。",
	        "📱 [GitHub 專案連結](https://github.com/yourusername/excel-qa-chatglm)",
	    ):
	        st.markdown(about_line)

	# Cached resource loaders
	@st.cache_resource
	def _build_embeddings(model_name):
	    """Construct the HuggingFace embedding model for `model_name`.

	    Errors are deliberately not handled here: they propagate to the caller
	    so that a failed load is not stored in the resource cache.
	    """
	    logger.info(f"加載嵌入模型: {model_name}")
	    return HuggingFaceEmbeddings(model_name=model_name)


	def load_embeddings(model_name):
	    """Return the cached embedding model for `model_name`, or None on failure.

	    Args:
	        model_name: Name passed to `HuggingFaceEmbeddings(model_name=...)`.

	    Returns:
	        The embeddings object, or None when loading raised. The failure is
	        logged and shown with `st.error`.

	    The try/except sits outside the cached helper on purpose. When it was
	    inside the cached function, a failed load returned None and that None
	    was what got cached, so later reruns could never retry.
	    """
	    try:
	        return _build_embeddings(model_name)
	    except Exception as e:
	        logger.error(f"嵌入模型加載失敗: {str(e)}")
	        st.error(f"嵌入模型加載失敗: {str(e)}")
	        return None

	@st.cache_resource(max_entries=1)
	def _load_model_and_tokenizer(model_name):
	    """Load the tokenizer and causal-LM weights for `model_name`.

	    Cached by model name only, so moving a generation slider does not reload
	    the weights. `max_entries=1` keeps a single cache entry, so selecting a
	    different model replaces the cached one instead of adding to it.
	    Errors propagate to the caller so a failed load is not cached.

	    Returns:
	        A `(tokenizer, model)` tuple.
	    """
	    logger.info(f"加載語言模型: {model_name}")

	    # Use the GPU when one is available
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    logger.info(f"使用設備: {device}")

	    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	    model = AutoModelForCausalLM.from_pretrained(
	        model_name,
	        trust_remote_code=True,
	        device_map=device,
	        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	    )
	    return tokenizer, model


	def load_llm(_model_name, _max_tokens, _temperature):
	    """Build a LangChain `HuggingFacePipeline` for the selected model and settings.

	    Args:
	        _model_name: Hugging Face model id to load.
	        _max_tokens: Passed to the pipeline as `max_new_tokens`.
	        _temperature: Sampling temperature. A value of 0 selects greedy
	            decoding instead of sampling.

	    Returns:
	        The `HuggingFacePipeline` wrapper, or None when loading or pipeline
	        construction raised. The failure is logged and shown with `st.error`.

	    This function is intentionally not cached itself. Streamlit leaves
	    underscore-prefixed parameters out of the cache key, so caching it made
	    every call return the first pipeline regardless of the chosen model or
	    slider values. Only the expensive weight loading is cached, keyed by
	    model name; the pipeline is rebuilt around the cached model on each call.
	    """
	    try:
	        tokenizer, model = _load_model_and_tokenizer(_model_name)

	        generation_kwargs = {
	            "max_new_tokens": _max_tokens,
	            "repetition_penalty": 1.1,
	        }
	        if _temperature > 0:
	            # temperature / top_p only take effect when sampling is enabled.
	            generation_kwargs.update(
	                do_sample=True,
	                temperature=_temperature,
	                top_p=0.9,
	            )
	        else:
	            # A temperature of 0 is not valid for sampling; decode greedily.
	            generation_kwargs["do_sample"] = False

	        pipe = pipeline(
	            "text-generation",
	            model=model,
	            tokenizer=tokenizer,
	            **generation_kwargs,
	        )

	        return HuggingFacePipeline(pipeline=pipe)
	    except Exception as e:
	        logger.error(f"語言模型加載失敗: {str(e)}")
	        st.error(f"語言模型加載失敗: {str(e)}")
	        return None

	# Build the vector store
	def create_vectorstore(texts, embeddings):
	    """Index `texts` in a FAISS store using `embeddings`.

	    Returns the store, or None when `FAISS.from_texts` raised. In that case
	    the error is logged and shown with `st.error`.
	    """
	    try:
	        store = FAISS.from_texts(texts, embedding=embeddings)
	    except Exception as e:
	        logger.error(f"向量資料庫創建失敗: {str(e)}")
	        st.error(f"向量資料庫創建失敗: {str(e)}")
	        store = None
	    return store

	# Build the LLM chain used for direct (no-retrieval) question answering
	def create_general_qa_chain(llm):
	    """Return an `LLMChain` that asks `llm` to answer a single `question`.

	    The prompt has one input variable, `question`.
	    """
	    return LLMChain(
	        llm=llm,
	        prompt=PromptTemplate(
	            input_variables=["question"],
	            template="""請回答以下問題：

	問題: {question}

	請提供詳細且有幫助的回答：""",
	        ),
	    )

	# Hybrid-mode question answering
	def hybrid_qa(query, qa_chain, general_chain, confidence_threshold=0.7):
	    """Answer from the knowledge base when it yields documents, else fall back.

	    Args:
	        query: The user's question.
	        qa_chain: Callable invoked as `qa_chain({"query": query})`. Its result
	            is read as a mapping with a `"source_documents"` entry.
	        general_chain: Object whose `run(question=...)` returns the answer text.
	        confidence_threshold: Accepted for compatibility but not applied.
	            No similarity score is read from the chain result here, so any
	            non-empty document list counts as relevant.

	    Returns:
	        A `(result, source, source_docs)` tuple. `source` is one of:
	        - `"knowledge_base"`: `result` is the chain result and `source_docs`
	          its documents.
	        - `"general"`: `result` is `{"result": <answer text>}` and
	          `source_docs` is empty.
	        - `"error"`: both paths failed; `result` holds an apology message.
	    """
	    # Try the knowledge base first
	    try:
	        kb_result = qa_chain({"query": query})
	        # The chain result is a mapping, so the documents are read by key.
	        # An attribute check (hasattr) on a dict never matches, which made
	        # this branch unreachable.
	        source_docs = kb_result.get("source_documents") or []
	        if source_docs:
	            return kb_result, "knowledge_base", source_docs
	    except Exception as e:
	        logger.warning(f"知識庫查詢失敗: {str(e)}")

	    # No usable knowledge-base answer: use the model's general knowledge
	    try:
	        general_result = general_chain.run(question=query)
	        return {"result": general_result}, "general", []
	    except Exception as e:
	        logger.error(f"一般知識查詢失敗: {str(e)}")
	        return {"result": "很抱歉，無法處理您的問題，請稍後再試。"}, "error", []

	# Main application flow
	# The language model is required whether or not a file has been uploaded.
	with st.spinner("正在加載AI模型..."):
	    llm = load_llm(model_option, max_tokens, temperature)

	# Without a model nothing below can work, so report and halt this run.
	if llm is None:
	    st.error("語言模型加載失敗，請刷新頁面重試")
	    st.stop()

	# Chain for questions answered from the model's own knowledge
	general_qa_chain = create_general_qa_chain(llm)

	# Knowledge-base state; filled in below once an Excel file is processed.
	kb_qa_chain = vectorstore = None
	has_knowledge_base = False

	# Optional Excel upload that feeds the knowledge base
	uploaded_file = st.file_uploader("上傳你的問答 Excel（可選）", type=["xlsx"])

	if uploaded_file:
	    try:
	        df = pd.read_excel(uploaded_file)

	        # Both the question and answer columns are required
	        if not {'問題', '答案'}.issubset(df.columns):
	            st.error("Excel 檔案需包含 '問題' 和 '答案' 欄位")
	        else:
	            # Data preview
	            with st.expander("Excel 資料預覽"):
	                st.dataframe(df.head())

	            # Rows with an empty question or answer cell would otherwise be
	            # indexed as the literal text "nan", so they are skipped.
	            qa_rows = df.dropna(subset=['問題', '答案'])

	            if qa_rows.empty:
	                # Upload is optional: with nothing to index, carry on without
	                # a knowledge base instead of failing in the vector store.
	                st.warning("Excel 檔案中沒有完整的問答資料，未建立知識庫")
	            else:
	                st.info(f"成功讀取 {len(qa_rows)} 筆問答對")

	                # One text per question/answer pair
	                texts = [
	                    f"問題：{q}\n答案：{a}"
	                    for q, a in zip(qa_rows['問題'], qa_rows['答案'])
	                ]

	                # Progress bar
	                progress_text = "正在處理中..."
	                my_bar = st.progress(0, text=progress_text)

	                # Load the embedding model
	                my_bar.progress(25, text="正在加載嵌入模型...")
	                embeddings = load_embeddings(embedding_option)
	                if embeddings is None:
	                    st.stop()

	                # Build the vector store
	                my_bar.progress(50, text="正在建立向量資料庫...")
	                vectorstore = create_vectorstore(texts, embeddings)
	                if vectorstore is None:
	                    st.stop()

	                # Build the retrieval QA chain over the top_k nearest entries
	                my_bar.progress(75, text="正在建立知識庫問答系統...")
	                kb_qa_chain = RetrievalQA.from_chain_type(
	                    llm=llm,
	                    retriever=vectorstore.as_retriever(search_kwargs={"k": top_k}),
	                    chain_type="stuff",
	                    return_source_documents=True,
	                )

	                has_knowledge_base = True

	                # Show the completed bar for a moment, then remove it
	                my_bar.progress(100, text="準備完成！")
	                time.sleep(1)
	                my_bar.empty()

	                st.success("知識庫已準備就緒，請輸入您的問題")

	    except Exception as e:
	        logger.error(f"Excel 檔案處理失敗: {str(e)}")
	        st.error(f"Excel 檔案處理失敗: {str(e)}")

	# Query section
	st.markdown("## 開始對話")
	query = st.text_input("請輸入你的問題：")

	if query:
	    with st.spinner("AI 思考中..."):
	        try:
	            start_time = time.time()

	            kb_only = mode == "僅使用上傳資料"
	            model_only = mode == "僅使用模型知識"

	            # Knowledge-base-only mode cannot proceed without an uploaded file.
	            if kb_only and not has_knowledge_base:
	                st.warning("您選擇了僅使用上傳資料模式，但尚未上傳Excel檔案。請上傳檔案或變更模式。")
	                st.stop()

	            if kb_only:
	                result = kb_qa_chain({"query": query})
	                source, source_docs = "knowledge_base", result["source_documents"]
	            elif model_only or not has_knowledge_base:
	                # Model-only mode, or hybrid mode with no knowledge base loaded
	                result = {"result": general_qa_chain.run(question=query)}
	                source, source_docs = "general", []
	            else:
	                # Hybrid mode with a knowledge base available
	                result, source, source_docs = hybrid_qa(query, kb_qa_chain, general_qa_chain)

	            elapsed = time.time() - start_time

	            # Show the answer
	            st.markdown("### AI 回答：")
	            st.markdown(result["result"])

	            # Tell the user where the answer came from
	            if source == "knowledge_base":
	                st.success("✅ 回答來自您的知識庫")
	                # List the retrieved reference entries
	                with st.expander("參考資料"):
	                    for number, doc in enumerate(source_docs, start=1):
	                        st.markdown(f"參考 {number}")
	                        st.markdown(doc.page_content)
	                        st.markdown("---")
	            elif source == "general":
	                origin_note = (
	                    "ℹ️ 回答來自模型的一般知識（知識庫中未找到相關內容）"
	                    if has_knowledge_base
	                    else "ℹ️ 回答來自模型的一般知識"
	                )
	                st.info(origin_note)

	            st.text(f"回答生成時間: {elapsed:.2f} 秒")

	        except Exception as e:
	            logger.error(f"查詢處理失敗: {str(e)}")
	            st.error(f"查詢處理失敗，請重試: {str(e)}")

	# Chat history placeholder: created once per session. Nothing in the visible
	# code appends to it or reads it yet.
	if "chat_history" not in st.session_state:
	    st.session_state.chat_history = []

	# Footer
	st.markdown("---")
	# Raw string: "\|" is not a valid escape sequence in a normal string literal,
	# which newer Python versions warn about. The text sent to Markdown is
	# unchanged (backslash followed by a pipe).
	# NOTE(review): the "[email protected]" address looks like an e-mail
	# obfuscation artifact rather than a real address — confirm the contact.
	st.markdown(r"Made with ❤️ \| 若需支援，請聯繫 [[email protected]](mailto:[email protected])")