pradeepsengarr committed on
Commit
c46f62c
·
verified ·
1 Parent(s): 1b0749c

Update app.py

Files changed (1)
  1. app.py +131 -131
app.py CHANGED
@@ -1,137 +1,137 @@
- import os
- import streamlit as st
- import fitz  # PyMuPDF
- import logging
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import Chroma
- from langchain_community.embeddings import SentenceTransformerEmbeddings
- from langchain_community.llms import HuggingFacePipeline
- from langchain.chains import RetrievalQA
- from langchain.prompts import PromptTemplate
- from langchain_community.document_loaders import TextLoader
-
- # --- Configuration ---
- st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
- st.title("📚 RAG-based PDF Chatbot")
- device = "cpu"
-
- # --- Logging ---
- logging.basicConfig(level=logging.INFO)
-
- # --- Load LLM ---
- @st.cache_resource
- def load_model():
-     checkpoint = "MBZUAI/LaMini-T5-738M"
-     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
-     return HuggingFacePipeline(pipeline=pipe)
-
- # --- Extract PDF Text ---
- def read_pdf(file):
-     try:
-         doc = fitz.open(stream=file.read(), filetype="pdf")
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         return text.strip()
-     except Exception as e:
-         logging.error(f"Failed to extract text: {e}")
-         return ""
-
- # --- Process Answer ---
- def process_answer(question, full_text):
-     # Save the full_text to a temporary file
-     with open("temp_text.txt", "w") as f:
-         f.write(full_text)
-
-     loader = TextLoader("temp_text.txt")
-     docs = loader.load()
-
-     # Chunk the documents with increased size and overlap
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-     splits = text_splitter.split_documents(docs)
-
-     # Load embeddings
-     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
-     # Create Chroma in-memory vector store
-     db = Chroma.from_documents(splits, embedding=embeddings)
-     retriever = db.as_retriever()
-
-     # Set up the model
-     llm = load_model()
-
-     # Create a custom prompt
-     prompt_template = PromptTemplate(
-         input_variables=["context", "question"],
-         template="""
- You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

- Context:
- {context}

- Question:
- {question}

- Important Instructions:
- - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
- - Do NOT summarize or paraphrase.
- - If the information is not in the context, say "Not found in the document."

- Answer:
- """)


-     # Retrieval QA with custom prompt
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         retriever=retriever,
-         chain_type="stuff",
-         chain_type_kwargs={"prompt": prompt_template}
-     )
-
-     # Return the answer using the retrieval QA chain
-     return qa_chain.run(question)
-
- # --- UI Layout ---
- with st.sidebar:
-     st.header("📄 Upload PDF")
-     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-
- # --- Main Interface ---
- if uploaded_file:
-     st.success(f"You uploaded: {uploaded_file.name}")
-     full_text = read_pdf(uploaded_file)
-
-     if full_text:
-         st.subheader("📝 PDF Preview")
-         with st.expander("View Extracted Text"):
-             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
-
-         st.subheader("💬 Ask a Question")
-         user_question = st.text_input("Type your question about the PDF content")
-
-         if user_question:
-             with st.spinner("Thinking..."):
-                 answer = process_answer(user_question, full_text)
-                 st.markdown("### 🤖 Answer")
-                 st.write(answer)
-
-             with st.sidebar:
-                 st.markdown("---")
-                 st.markdown("**💡 Suggestions:**")
-                 st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-                 with st.expander("💡 Suggestions", expanded=True):
-                     st.markdown("""
- - "Summarize this document"
- - "Give a quick summary"
- - "What are the main points?"
- - "Explain this document in short"
- """)
-
-     else:
-         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
- else:
-     st.info("Upload a PDF to begin.")
 
+ # import os
+ # import streamlit as st
+ # import fitz  # PyMuPDF
+ # import logging
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain_community.vectorstores import Chroma
+ # from langchain_community.embeddings import SentenceTransformerEmbeddings
+ # from langchain_community.llms import HuggingFacePipeline
+ # from langchain.chains import RetrievalQA
+ # from langchain.prompts import PromptTemplate
+ # from langchain_community.document_loaders import TextLoader
+
+ # # --- Configuration ---
+ # st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+ # st.title("📚 RAG-based PDF Chatbot")
+ # device = "cpu"
+
+ # # --- Logging ---
+ # logging.basicConfig(level=logging.INFO)
+
+ # # --- Load LLM ---
+ # @st.cache_resource
+ # def load_model():
+ #     checkpoint = "MBZUAI/LaMini-T5-738M"
+ #     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ #     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+ #     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
+ #     return HuggingFacePipeline(pipeline=pipe)
+
+ # # --- Extract PDF Text ---
+ # def read_pdf(file):
+ #     try:
+ #         doc = fitz.open(stream=file.read(), filetype="pdf")
+ #         text = ""
+ #         for page in doc:
+ #             text += page.get_text()
+ #         return text.strip()
+ #     except Exception as e:
+ #         logging.error(f"Failed to extract text: {e}")
+ #         return ""
+
+ # # --- Process Answer ---
+ # def process_answer(question, full_text):
+ #     # Save the full_text to a temporary file
+ #     with open("temp_text.txt", "w") as f:
+ #         f.write(full_text)
+
+ #     loader = TextLoader("temp_text.txt")
+ #     docs = loader.load()
+
+ #     # Chunk the documents with increased size and overlap
+ #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
+ #     splits = text_splitter.split_documents(docs)
+
+ #     # Load embeddings
+ #     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+
+ #     # Create Chroma in-memory vector store
+ #     db = Chroma.from_documents(splits, embedding=embeddings)
+ #     retriever = db.as_retriever()
+
+ #     # Set up the model
+ #     llm = load_model()
+
+ #     # Create a custom prompt
+ #     prompt_template = PromptTemplate(
+ #         input_variables=["context", "question"],
+ #         template="""
+ # You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

+ # Context:
+ # {context}

+ # Question:
+ # {question}

+ # Important Instructions:
+ # - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
+ # - Do NOT summarize or paraphrase.
+ # - If the information is not in the context, say "Not found in the document."

+ # Answer:
+ # """)


+ #     # Retrieval QA with custom prompt
+ #     qa_chain = RetrievalQA.from_chain_type(
+ #         llm=llm,
+ #         retriever=retriever,
+ #         chain_type="stuff",
+ #         chain_type_kwargs={"prompt": prompt_template}
+ #     )
+
+ #     # Return the answer using the retrieval QA chain
+ #     return qa_chain.run(question)
+
+ # # --- UI Layout ---
+ # with st.sidebar:
+ #     st.header("📄 Upload PDF")
+ #     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+
+ # # --- Main Interface ---
+ # if uploaded_file:
+ #     st.success(f"You uploaded: {uploaded_file.name}")
+ #     full_text = read_pdf(uploaded_file)
+
+ #     if full_text:
+ #         st.subheader("📝 PDF Preview")
+ #         with st.expander("View Extracted Text"):
+ #             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
+
+ #         st.subheader("💬 Ask a Question")
+ #         user_question = st.text_input("Type your question about the PDF content")
+
+ #         if user_question:
+ #             with st.spinner("Thinking..."):
+ #                 answer = process_answer(user_question, full_text)
+ #                 st.markdown("### 🤖 Answer")
+ #                 st.write(answer)
+
+ #             with st.sidebar:
+ #                 st.markdown("---")
+ #                 st.markdown("**💡 Suggestions:**")
+ #                 st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+ #                 with st.expander("💡 Suggestions", expanded=True):
+ #                     st.markdown("""
+ # - "Summarize this document"
+ # - "Give a quick summary"
+ # - "What are the main points?"
+ # - "Explain this document in short"
+ # """)
+
+ #     else:
+ #         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
+ # else:
+ #     st.info("Upload a PDF to begin.")