Update app.py
app.py CHANGED
@@ -1,12 +1,13 @@
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings  # Updated import
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from langchain.llms import HuggingFacePipeline
 import torch
 from transformers import pipeline
+from langdetect import detect
 
 # Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
 def load_llm():
@@ -30,17 +31,45 @@ def split_text(text, chunk_size=1000, chunk_overlap=200):
     return chunks
 
 # Create embeddings and vector store
-def create_vector_store(chunks):
+def create_vector_store(chunks, indexing_method="multi-representation"):
     embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-    vector_store = FAISS.from_texts(chunks, embeddings)
+    if indexing_method == "multi-representation":
+        vector_store = FAISS.from_texts(chunks, embeddings)
+    elif indexing_method == "raptors":
+        # Implement RAPTORS logic here (e.g., hierarchical chunking)
+        vector_store = FAISS.from_texts(chunks, embeddings)
+    elif indexing_method == "colbert":
+        # Implement ColBERT logic here (e.g., contextualized embeddings)
+        vector_store = FAISS.from_texts(chunks, embeddings)
     return vector_store
 
 # Query the PDF
-def query_pdf(vector_store, query, llm):
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
+def query_pdf(vector_store, query, llm, query_method="multi-query"):
+    if query_method == "multi-query":
+        # Implement Multi-Query logic here
+        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
+    elif query_method == "rag-fusion":
+        # Implement RAG Fusion logic here
+        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
+    elif query_method == "decomposition":
+        # Implement Decomposition logic here
+        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
+    elif query_method == "step-back":
+        # Implement Step Back logic here
+        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
+    elif query_method == "hyde":
+        # Implement HyDE logic here
+        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
     result = qa.run(query)
     return result
 
+# Detect language of the text
+def detect_language(text):
+    try:
+        return detect(text)
+    except:
+        return "en"  # Default to English if detection fails
+
 # Streamlit App
 def main():
     st.title("Chat with PDF")
@@ -55,11 +84,25 @@
         # Extract text
         text = extract_text_from_pdf(uploaded_file)
 
+        # Detect language
+        language = detect_language(text)
+        st.write(f"Detected Language: {language}")
+
         # Split text into chunks
-        chunks = split_text(text)
+        chunk_size = st.slider("Chunk Size", 500, 2000, 1000)
+        chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
+        chunks = split_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+        # Indexing options
+        indexing_method = st.selectbox(
+            "Indexing Method",
+            ["multi-representation", "raptors", "colbert"],
+            help="Choose how to index the PDF text."
+        )
+        st.write(f"**Indexing Method:** {indexing_method}")
 
         # Create vector store
-        vector_store = create_vector_store(chunks)
+        vector_store = create_vector_store(chunks, indexing_method=indexing_method)
 
         # Load LLM
         llm = load_llm()
@@ -67,15 +110,16 @@
         # Query translation options
         query_method = st.selectbox(
             "Query Translation Method",
-            ["
+            ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
             help="Choose a method to improve query retrieval."
         )
+        st.write(f"**Query Translation Method:** {query_method}")
 
         # User input
         query = st.text_input("Ask a question about the PDF:")
         if query:
             # Query the PDF
-            result = query_pdf(vector_store, query, llm)
+            result = query_pdf(vector_store, query, llm, query_method=query_method)
             st.write("**Answer:**", result["answer"])
             st.write("**Source Text:**", result["source_text"])
 
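A few reviewer notes on the placeholders this commit introduces. First, the three indexing branches in create_vector_store currently build the identical flat FAISS index, so "raptors" and "colbert" differ from "multi-representation" in name only. As a hedged sketch of what an actual multi-representation index could look like (the helper name, prompt wording, and full_text metadata key are illustrative assumptions, not part of the app):

# Sketch only: index one-sentence LLM summaries of each chunk, keeping
# the full chunk as metadata so the answering step can still see full text.
def create_multi_representation_store(chunks, llm, embeddings):
    summaries = [llm(f"Summarize this passage in one sentence:\n{c}") for c in chunks]
    return FAISS.from_texts(
        summaries,
        embeddings,
        metadatas=[{"full_text": c} for c in chunks],
    )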
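Likewise, all five query_method branches in query_pdf build the same RetrievalQA chain, so the selector has no effect yet. For the multi-query branch, LangChain's MultiQueryRetriever already implements the idea (LLM-generated rephrasings, one retrieval per rephrasing, de-duplicated union); a sketch of swapping it in:

from langchain.retrievers.multi_query import MultiQueryRetriever

# Sketch: retrieve with several LLM-generated variants of the question
# instead of the raw question alone.
retriever = MultiQueryRetriever.from_llm(retriever=vector_store.as_retriever(), llm=llm)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)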
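The rag-fusion branch would also fan the question out into variants, but merge the per-variant rankings with reciprocal rank fusion instead of a plain union. A self-contained sketch of the fusion step (k=60 is the conventional RRF constant):

# Sketch: merge ranked lists of chunk strings into one ranking; chunks
# that rank high across many variant queries float to the top.
def reciprocal_rank_fusion(rankings, k=60):
    scores = {}
    for ranking in rankings:
        for rank, chunk in enumerate(ranking):
            scores[chunk] = scores.get(chunk, 0.0) + 1.0 / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)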
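For the decomposition branch, one hedged reading is: split the question into sub-questions, answer each against the index, then compose. Inside that branch, something like (prompt wording assumed):

# Sketch: answer simpler sub-questions first, then combine them.
subs = llm(f"Break this question into two or three simpler sub-questions, one per line: {query}")
partials = [qa.run(s) for s in subs.splitlines() if s.strip()]
result = llm("Combine these partial answers into one answer:\n" + "\n".join(partials))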
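The step-back branch would retrieve with a deliberately more generic paraphrase of the question alongside the original; inside that branch, something like (prompt assumed):

# Sketch: a broader "stepped back" question often retrieves background
# chunks the literal question misses.
step_back = llm(f"Rewrite this question in a more generic form: {query}")
retriever = vector_store.as_retriever()
docs = retriever.get_relevant_documents(query) + retriever.get_relevant_documents(step_back)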
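And the hyde branch would embed a hypothetical answer rather than the question. A sketch, assuming the embeddings object is made available here (in the current code it is local to create_vector_store):

# Sketch: HyDE searches with the embedding of an LLM-written hypothetical
# answer, which tends to sit closer to answer-bearing chunks than the
# raw question does.
hypothetical = llm(f"Write a short passage that answers: {query}")
docs = vector_store.similarity_search_by_vector(embeddings.embed_query(hypothetical), k=4)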
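Separately, an actual bug rather than a placeholder: qa.run(query) returns a plain string, yet main() reads result["answer"] and result["source_text"], so the first query will raise a TypeError. One way to make query_pdf return the dict shape main() expects, replacing its tail (return_source_documents is a real RetrievalQA option; the key names here just mirror main()):

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,  # include retrieved chunks in the output
)
output = qa({"query": query})  # dict with "result" and "source_documents"
return {
    "answer": output["result"],
    "source_text": "\n\n".join(d.page_content for d in output["source_documents"]),
}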
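Finally, two small hardening notes on detect_language: langdetect is stochastic unless seeded, so repeated runs on the same text can disagree, and the bare except also swallows unrelated errors (including typos inside the try block). A sketch:

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # make detection deterministic across Streamlit reruns

def detect_language(text):
    try:
        return detect(text[:5000])  # a bounded prefix keeps long PDFs fast
    except LangDetectException:
        return "en"  # Default to English if detection fails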