bainskarman committed on
Commit
9d72b0b
·
verified ·
1 Parent(s): 06b340c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -9
app.py CHANGED
@@ -1,12 +1,13 @@
1
  import streamlit as st
2
  from PyPDF2 import PdfReader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.vectorstores import FAISS
6
  from langchain.chains import RetrievalQA
7
  from langchain.llms import HuggingFacePipeline
8
  import torch
9
  from transformers import pipeline
 
10
 
11
  # Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
12
  def load_llm():
@@ -30,17 +31,45 @@ def split_text(text, chunk_size=1000, chunk_overlap=200):
30
  return chunks
31
 
32
  # Create embeddings and vector store
33
def create_vector_store(chunks):
    """Embed the text chunks and index them in a FAISS vector store.

    Args:
        chunks: Text chunks to embed and index.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedder)
37
 
38
  # Query the PDF
39
def query_pdf(vector_store, query, llm):
    """Answer a question with a "stuff" RetrievalQA chain over the store.

    Args:
        vector_store: Store whose ``as_retriever()`` supplies the retriever.
        query: The question to run through the chain.
        llm: Language model handed to the chain.

    Returns:
        Whatever the chain's ``run(query)`` call returns.
    """
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return chain.run(query)
43
 
 
 
 
 
 
 
 
44
  # Streamlit App
45
  def main():
46
  st.title("Chat with PDF")
@@ -55,11 +84,25 @@ def main():
55
  # Extract text
56
  text = extract_text_from_pdf(uploaded_file)
57
 
 
 
 
 
58
  # Split text into chunks
59
- chunks = split_text(text)
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Create vector store
62
- vector_store = create_vector_store(chunks)
63
 
64
  # Load LLM
65
  llm = load_llm()
@@ -67,15 +110,16 @@ def main():
67
  # Query translation options
68
  query_method = st.selectbox(
69
  "Query Translation Method",
70
- ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"],
71
  help="Choose a method to improve query retrieval."
72
  )
 
73
 
74
  # User input
75
  query = st.text_input("Ask a question about the PDF:")
76
  if query:
77
  # Query the PDF
78
- result = query_pdf(vector_store, query, llm)
79
  st.write("**Answer:**", result["answer"])
80
  st.write("**Source Text:**", result["source_text"])
81
 
 
1
  import streamlit as st
2
  from PyPDF2 import PdfReader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings # Updated import
5
  from langchain.vectorstores import FAISS
6
  from langchain.chains import RetrievalQA
7
  from langchain.llms import HuggingFacePipeline
8
  import torch
9
  from transformers import pipeline
10
+ from langdetect import detect
11
 
12
  # Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
13
  def load_llm():
 
31
  return chunks
32
 
33
  # Create embeddings and vector store
34
def create_vector_store(chunks, indexing_method="multi-representation"):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        chunks: Non-empty list of text chunks to embed and index.
        indexing_method: One of "multi-representation", "raptors" or
            "colbert". All three currently build the same flat FAISS index;
            the RAPTOR- and ColBERT-specific strategies are not implemented.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``indexing_method`` is not a supported option or
            ``chunks`` is empty.
    """
    supported_methods = ("multi-representation", "raptors", "colbert")
    # Reject unknown options up front, before the embeddings object is
    # constructed, instead of falling through every branch and failing with
    # an unbound local at the return.
    if indexing_method not in supported_methods:
        raise ValueError(
            f"Unsupported indexing method: {indexing_method!r}. "
            f"Expected one of: {', '.join(supported_methods)}."
        )
    if not chunks:
        # e.g. a PDF with no extractable text yields nothing to index.
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    # TODO: implement RAPTOR (hierarchical chunking) and ColBERT
    # (contextualized embeddings); until then every method shares this index.
    return FAISS.from_texts(chunks, embeddings)
45
 
46
  # Query the PDF
47
def query_pdf(vector_store, query, llm, query_method="multi-query"):
    """Answer a question about the indexed PDF with a RetrievalQA chain.

    Args:
        vector_store: Store whose ``as_retriever()`` supplies the retriever.
        query: The user's question.
        llm: Language model handed to the chain.
        query_method: One of "multi-query", "rag-fusion", "decomposition",
            "step-back" or "hyde". All five currently run the same plain
            "stuff" chain; the method-specific query translation is not
            implemented.

    Returns:
        Whatever the chain's ``run(query)`` call returns.
        NOTE(review): this is presumably a plain string, so callers that
        index the result like a mapping would fail - confirm against callers.

    Raises:
        ValueError: If ``query_method`` is not a supported option.
    """
    supported_methods = (
        "multi-query",
        "rag-fusion",
        "decomposition",
        "step-back",
        "hyde",
    )
    # Validate before building the chain so an unknown option produces a
    # clear error rather than an unbound local further down.
    if query_method not in supported_methods:
        raise ValueError(
            f"Unsupported query method: {query_method!r}. "
            f"Expected one of: {', '.join(supported_methods)}."
        )

    # TODO: implement the per-method query translation; until then every
    # method shares this single chain.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa.run(query)
65
 
66
+ # Detect language of the text
67
def detect_language(text):
    """Return the detected language code for ``text``, defaulting to "en".

    Args:
        text: The text to inspect. May be empty or ``None``.

    Returns:
        The value returned by ``detect(text)``, or "en" when the text is
        empty or whitespace-only, or when detection raises.
    """
    # Nothing to analyse: skip the detector and use the default directly.
    if not text or not text.strip():
        return "en"
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt and SystemExit propagate.
        return "en"  # Default to English if detection fails
72
+
73
  # Streamlit App
74
  def main():
75
  st.title("Chat with PDF")
 
84
  # Extract text
85
  text = extract_text_from_pdf(uploaded_file)
86
 
87
+ # Detect language
88
+ language = detect_language(text)
89
+ st.write(f"Detected Language: {language}")
90
+
91
  # Split text into chunks
92
+ chunk_size = st.slider("Chunk Size", 500, 2000, 1000)
93
+ chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
94
+ chunks = split_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
95
+
96
+ # Indexing options
97
+ indexing_method = st.selectbox(
98
+ "Indexing Method",
99
+ ["multi-representation", "raptors", "colbert"],
100
+ help="Choose how to index the PDF text."
101
+ )
102
+ st.write(f"**Indexing Method:** {indexing_method}")
103
 
104
  # Create vector store
105
+ vector_store = create_vector_store(chunks, indexing_method=indexing_method)
106
 
107
  # Load LLM
108
  llm = load_llm()
 
110
  # Query translation options
111
  query_method = st.selectbox(
112
  "Query Translation Method",
113
+ ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
114
  help="Choose a method to improve query retrieval."
115
  )
116
+ st.write(f"**Query Translation Method:** {query_method}")
117
 
118
  # User input
119
  query = st.text_input("Ask a question about the PDF:")
120
  if query:
121
  # Query the PDF
122
+ result = query_pdf(vector_store, query, llm, query_method=query_method)
123
  st.write("**Answer:**", result["answer"])
124
  st.write("**Source Text:**", result["source_text"])
125