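"""Excel-aware RAG chatbot built with Streamlit and LangChain.

The app cleans the uploaded 'Data Base' sheet, embeds each row with a MiniLM
sentence transformer, indexes the chunks in FAISS, and answers questions with
a local Flan-T5 model through a retrieval-with-sources chain.
"""
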
import streamlit as st
import pandas as pd
import tempfile
import os
import json
from pathlib import Path

from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
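# NOTE: these import paths follow the pre-0.1 "langchain" package layout; on newer
# releases most of these classes live in langchain_community / langchain_text_splitters,
# so adjust the imports if you upgrade.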

USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
CHAT_HISTORY_FILE = Path("chat_memory.json")
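# Chat history is persisted to CHAT_HISTORY_FILE so conversations survive Streamlit reruns.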

def load_chat_history():
    if CHAT_HISTORY_FILE.exists():
        with open(CHAT_HISTORY_FILE, "r") as f:
            return json.load(f)
    return []

def save_chat_history(history):
    with open(CHAT_HISTORY_FILE, "w") as f:
        json.dump(history, f)

def preprocess_excel(file_path: str) -> pd.DataFrame:
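    """Load the 'Data Base' sheet, skip the four leading layout rows, promote the
    next row to the header, and drop fully empty rows and columns."""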
    df_raw = pd.read_excel(file_path, sheet_name='Data Base', header=None)
    df = df_raw.iloc[4:].copy()
    df.columns = df.iloc[0]
    df = df[1:]
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def build_vectorstore_from_dataframe(df: pd.DataFrame):
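    """Join each row into a single ' | '-delimited string, split it into overlapping
    chunks, embed the chunks with MiniLM, and index them in a FAISS store."""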
    df.fillna("", inplace=True)
    df['combined_text'] = df.apply(lambda row: ' | '.join([str(cell) for cell in row]), axis=1)

    # DataFrameLoader only needs the page-content column; all other columns become metadata.
    docs_loader = DataFrameLoader(
        df[['combined_text']].rename(columns={"combined_text": "text"}),
        page_content_column="text"
    )
    documents = docs_loader.load()

    for i, doc in enumerate(documents):
        doc.metadata["source"] = f"Row {i+1}"

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False}
    )
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    return vectorstore

def create_qa_pipeline(vectorstore):
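    """Wrap a local Flan-T5 text2text pipeline as a LangChain LLM and pair it with
    the FAISS retriever in a sources-aware retrieval QA chain."""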
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    gen_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    llm = HuggingFacePipeline(pipeline=gen_pipeline)
    retriever = vectorstore.as_retriever()
    qa = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
    return qa

st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("πŸ“Š Excel-Aware RAG Chatbot (Professional QA)")
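# Sidebar: Excel upload plus a button that clears both session and on-disk chat history.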

with st.sidebar:
    uploaded_file = st.file_uploader("Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)", type=["xlsx", "xlsm"])
    if st.button("πŸ—‘οΈ Clear Chat History"):
        st.session_state.chat_history = []
        if CHAT_HISTORY_FILE.exists():
            CHAT_HISTORY_FILE.unlink()
        st.rerun()

if "chat_history" not in st.session_state:
    st.session_state.chat_history = load_chat_history()

if uploaded_file is not None:
    with st.spinner("Processing and indexing your Excel sheet..."):
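        # Write the upload to a temporary file so pandas can read it from a real path.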
        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
            tmp_file.write(uploaded_file.read())
            tmp_path = tmp_file.name

        try:
            cleaned_df = preprocess_excel(tmp_path)
            vectorstore = build_vectorstore_from_dataframe(cleaned_df)
            qa = create_qa_pipeline(vectorstore)
            st.success("βœ… File processed and chatbot ready! Ask your questions below.")
        except Exception as e:
            st.error(f"❌ Error processing file: {e}")
            st.stop()
        finally:
            os.remove(tmp_path)

    for message in st.session_state.chat_history:
        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
        st.chat_message(message["role"], avatar=avatar).markdown(message["content"])

    user_prompt = st.chat_input("Ask about inspections, delays, backlog...")

    if user_prompt:
        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
        st.chat_message("user", avatar=USER_AVATAR).markdown(user_prompt)

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            with st.spinner("Searching and generating..."):
                try:
                    response = qa.invoke({"question": user_prompt})
                    final_response = response['answer']
                    sources = response.get('sources', '')
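                    # Simulate streaming by revealing the generated answer word by word.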
                    placeholder = st.empty()
                    streamed = ""
                    for word in final_response.split():
                        streamed += word + " "
                        placeholder.markdown(streamed + "β–Œ")
                    placeholder.markdown(f"**{final_response.strip()}**")
                    if sources:
                        st.markdown(f"<sub>πŸ“Ž <i>{sources}</i></sub>", unsafe_allow_html=True)
                    st.session_state.chat_history.append({"role": "assistant", "content": final_response})
                    save_chat_history(st.session_state.chat_history)
                except Exception as e:
                    st.error(f"❌ Error: {e}")
else:
    st.info("Upload a file on the left to get started.")