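"""Excel-Aware RAG Chatbot.

Streamlit app that indexes a structured 'Data Base' Excel sheet into a FAISS
vector store (one natural-language sentence per row) and answers questions
about it with a flan-t5-base model behind a RetrievalQAWithSourcesChain.
"""
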
import streamlit as st
import pandas as pd
import tempfile
import os
import json
from pathlib import Path
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
CHAT_HISTORY_FILE = Path("chat_memory.json")

def load_chat_history():
    """Load persisted chat history from disk, or start with an empty one."""
    if CHAT_HISTORY_FILE.exists():
        with open(CHAT_HISTORY_FILE, "r") as f:
            return json.load(f)
    return []


def save_chat_history(history):
    """Persist the chat history so it survives app restarts."""
    with open(CHAT_HISTORY_FILE, "w") as f:
        json.dump(history, f)

def preprocess_excel(file_path: str) -> pd.DataFrame:
    # The 'Data Base' sheet carries metadata rows above the real header,
    # so read without a header and rebuild it manually.
    df_raw = pd.read_excel(file_path, sheet_name='Data Base', header=None)
    df = df_raw.iloc[4:].copy()                 # drop the first four metadata rows
    df.columns = df.iloc[0]                     # sheet row 5 holds the column names
    df = df[1:]                                 # keep only the data rows below the header
    df.dropna(how='all', inplace=True)          # discard fully empty rows...
    df.dropna(axis=1, how='all', inplace=True)  # ...and fully empty columns
    df.reset_index(drop=True, inplace=True)
    df.columns = df.columns.astype(str)
    return df
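
# Expected sheet layout, inferred from preprocess_excel above (illustrative,
# not verified against a real workbook): rows 1-4 hold metadata, row 5 holds
# the header, and data starts on row 6, e.g.:
#
#   Item Class | Job Done | Backlog? | Days in Backlog
#   PSV        | Yes      | No       | 0
#   Piping     | No       | Yes      | 42
#
# Other columns survive preprocessing, but only the four above are embedded.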

def build_vectorstore_from_structured_records(df: pd.DataFrame):
    df.fillna("", inplace=True)
    records = []
    for i, row in df.iterrows():
        item_class = str(row.get("Item Class", "")).strip()
        job_done = str(row.get("Job Done", "")).strip()
        backlog = str(row.get("Backlog?", "")).strip()
        days = str(row.get("Days in Backlog", "")).strip()
        if not any([item_class, job_done, backlog, days]):
            continue  # skip rows with no usable values
        sentence = (
            f"Item Class {item_class} has status {job_done}, "
            f"is in {backlog} backlog, and has {days} days."
        )
        records.append(Document(page_content=sentence, metadata={"source": f"Row {i+1}"}))
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(records)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    return vectorstore
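
# Design note: each row is embedded as one short sentence rather than as raw
# cell values, so the retriever can match conversational questions ("which
# item classes are in backlog?") against row-level facts. With sentences this
# short, the 1000-character splitter above is effectively a pass-through.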

def create_qa_pipeline(vectorstore):
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    gen_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    llm = HuggingFacePipeline(pipeline=gen_pipeline)
    retriever = vectorstore.as_retriever()
    qa = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
    return qa
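
# RetrievalQAWithSourcesChain returns a dict with 'answer' and 'sources' keys;
# the chat handler below relies on both.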

st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("📊 Excel-Aware RAG Chatbot (Structured QA)")

with st.sidebar:
    uploaded_file = st.file_uploader(
        "Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)",
        type=["xlsx", "xlsm"],
    )
    if st.button("🗑️ Clear Chat History"):
        st.session_state.chat_history = []
        if CHAT_HISTORY_FILE.exists():
            CHAT_HISTORY_FILE.unlink()
        st.rerun()

if "chat_history" not in st.session_state:
    st.session_state.chat_history = load_chat_history()
qa = None  # set once a file has been uploaded and indexed successfully

if uploaded_file is not None:
    with st.spinner("Processing and indexing your Excel sheet..."):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
            tmp_file.write(uploaded_file.read())
            tmp_path = tmp_file.name
        try:
            df = preprocess_excel(tmp_path)
            vectorstore = build_vectorstore_from_structured_records(df)
            qa = create_qa_pipeline(vectorstore)
            st.success("✅ File processed and chatbot ready! Ask your questions below.")
        except Exception as e:
            st.error(f"❌ Error processing file: {e}")
        finally:
            os.remove(tmp_path)
for message in st.session_state.chat_history:
    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
    st.chat_message(message["role"], avatar=avatar).markdown(message["content"])
user_prompt = st.chat_input("Ask about item classes, backlog, or status...")

if user_prompt:
    if qa is None:
        # Guard against asking questions before any file has been indexed,
        # which would otherwise raise a NameError for `qa`.
        st.warning("Please upload and index an Excel file before asking questions.")
    else:
        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
        st.chat_message("user", avatar=USER_AVATAR).markdown(user_prompt)
        with st.chat_message("assistant", avatar=BOT_AVATAR):
            with st.spinner("Thinking..."):
                try:
                    response = qa.invoke({"question": user_prompt})
                    final_response = response['answer']
                    sources = response.get('sources', '')
                    # Reveal the answer word by word to mimic streaming output.
                    placeholder = st.empty()
                    streamed = ""
                    for word in final_response.split():
                        streamed += word + " "
                        placeholder.markdown(streamed + "▌")
                    placeholder.markdown(f"**{final_response.strip()}**")
                    if sources:
                        st.markdown(f"<sub>📎 <i>{sources}</i></sub>", unsafe_allow_html=True)
                    st.session_state.chat_history.append({"role": "assistant", "content": final_response})
                    save_chat_history(st.session_state.chat_history)
                except Exception as e:
                    st.error(f"❌ Error: {e}")
else:
    st.info("Upload a file on the left to get started.")