amiguel committed on
Commit
623e43b
·
verified ·
1 Parent(s): 19b43e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -1
app.py CHANGED
@@ -1,8 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  try:
2
  cleaned_df = preprocess_excel(tmp_path)
3
  vectorstore = build_vectorstore_from_dataframe(cleaned_df)
4
  qa = create_qa_pipeline(vectorstore)
5
- st.success("✅ File processed and chatbot ready! Ask your questions below.")
6
 
7
  if "chat_history" not in st.session_state:
8
  st.session_state.chat_history = []
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import tempfile
4
+ import os
5
+
6
+ from langchain.document_loaders import DataFrameLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.chains import RetrievalQA
11
+ from langchain import HuggingFacePipeline
12
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
13
+
14
def preprocess_excel(file_path: str) -> pd.DataFrame:
    """Load the 'Data Base' sheet of an Excel workbook and tidy it into a table.

    The sheet is read without a header. Its first four rows are discarded,
    the fifth row supplies the column labels, and rows or columns that are
    entirely empty are removed before the index is renumbered from zero.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The cleaned DataFrame.
    """
    sheet = pd.read_excel(file_path, sheet_name='Data Base', header=None)

    # Everything above the fifth row is skipped; that row holds the labels.
    table = sheet.iloc[4:].copy()
    table.columns = table.iloc[0]

    # Drop the label row itself, then prune empty rows and columns.
    return (
        table[1:]
        .dropna(how='all')
        .dropna(axis=1, how='all')
        .reset_index(drop=True)
    )
23
+
24
def build_vectorstore_from_dataframe(df: pd.DataFrame):
    """Index every row of *df* in a FAISS vector store.

    Each row is flattened into one string by joining the string form of its
    cells with ' | '. Those strings are loaded as documents, split with a
    RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=150),
    embedded on the CPU with "sentence-transformers/all-MiniLM-l6-v2"
    (embeddings are not normalized), and stored in FAISS.

    Args:
        df: Table to index. It is left unmodified; missing values are
            replaced with empty strings on an internal copy only.

    Returns:
        The FAISS vector store built from the split documents.
    """
    # fillna without inplace returns a new frame, so the caller's data keeps
    # its missing values and does not gain a 'combined_text' column.
    filled = df.fillna("")
    filled['combined_text'] = filled.apply(
        lambda row: ' | '.join(str(cell) for cell in row), axis=1
    )

    docs_loader = DataFrameLoader(
        filled[['combined_text']], page_content_column='combined_text'
    )
    documents = docs_loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    return FAISS.from_documents(split_docs, embeddings)
41
+
42
def create_qa_pipeline(vectorstore):
    """Build a retrieval question-answering chain on top of *vectorstore*.

    Loads the "google/flan-t5-base" tokenizer and seq2seq model, wraps them
    in a "text2text-generation" pipeline with max_length=512, and combines
    that LLM with the store's retriever in a "stuff" RetrievalQA chain that
    does not return source documents.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The RetrievalQA chain.
    """
    checkpoint = "google/flan-t5-base"

    # Keyword arguments are evaluated left to right, so the tokenizer is
    # loaded before the model.
    generator = pipeline(
        "text2text-generation",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        max_length=512,
    )

    return RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=generator),
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        return_source_documents=False,
    )
53
+
54
+ st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
55
+ st.title("📊 Excel-Aware RAG Chatbot (Professional QA)")
56
+
57
+ with st.sidebar:
58
+ uploaded_file = st.file_uploader("Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)", type=["xlsx", "xlsm"])
59
+
60
+ if uploaded_file is not None:
61
+ with st.spinner("Processing and indexing your Excel sheet..."):
62
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
63
+ tmp_file.write(uploaded_file.read())
64
+ tmp_path = tmp_file.name
65
+
66
  try:
67
  cleaned_df = preprocess_excel(tmp_path)
68
  vectorstore = build_vectorstore_from_dataframe(cleaned_df)
69
  qa = create_qa_pipeline(vectorstore)
70
+ st.success(" File processed and chatbot ready! Ask your questions below.")
71
 
72
  if "chat_history" not in st.session_state:
73
  st.session_state.chat_history = []