Update app.py
app.py CHANGED

@@ -1,19 +1,56 @@
 import streamlit as st
 import os
 import requests
+import re
 from langdetect import detect
 from PyPDF2 import PdfReader
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.neighbors import NearestNeighbors
 import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+import hashlib

 # Load the Hugging Face token from environment variables
-huggingface_token = os.environ.get("Key2")
+huggingface_token = os.environ.get("Key2")

-#
+# Initialize Sentence Transformer model for better embeddings
+sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# Cache PDF extraction
+@st.cache_data
+def extract_text_from_pdf(pdf_file):
+    pdf_reader = PdfReader(pdf_file)
+    text_data = []
+    for page_num, page in enumerate(pdf_reader.pages):
+        text = page.extract_text()
+        text = re.sub(r'\s+', ' ', text)  # Clean extra whitespace
+        text_data.append({
+            "page": page_num + 1,
+            "content": text
+        })
+    return text_data
+
+# Enhanced text chunking with overlap
+def split_text_into_chunks(text, chunk_size=500, overlap=100):
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), chunk_size - overlap):
+        chunks.append(" ".join(words[i:i + chunk_size]))
+    return chunks
+
+# Enhanced semantic search using sentence transformers
+def semantic_search(query, chunks, threshold=0.3):
+    query_embedding = sentence_model.encode([query])
+    chunk_embeddings = sentence_model.encode(chunks)
+    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
+    results = [(chunks[i], similarities[i]) for i in np.argsort(similarities)[::-1]]
+    return [res for res in results if res[1] > threshold]
+
+# Improved query translation with error handling
 def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
-    model_name = "HuggingFaceH4/zephyr-7b-alpha"
+    model_name = "HuggingFaceH4/zephyr-7b-alpha"
     api_url = f"https://api-inference.huggingface.co/models/{model_name}"
     headers = {"Authorization": f"Bearer {huggingface_token}"}
     payload = {
@@ -24,206 +61,96 @@ def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
             "top_k": top_k,
         },
     }
-    response = requests.post(api_url, headers=headers, json=payload)
-    if response.status_code == 200:
-        return response.json()[0]["generated_text"]
-    else:
-        st.error(f"Error: {response.status_code} - {response.text}")
-        return None
-
-# Function to detect language
-def detect_language(text):
     try:
-        return detect(text)
-    except:
-        return None
-
-# Function to extract text from PDF
-def extract_text_from_pdf(pdf_file):
-    pdf_reader = PdfReader(pdf_file)
-    text_data = []
-    for page_num, page in enumerate(pdf_reader.pages):
-        lines = page.extract_text().split('\n')
-        for line_num, line in enumerate(lines):
-            text_data.append({
-                "page": page_num + 1,
-                "line": line_num + 1,
-                "content": line
-            })
-    return text_data
-
-# Function to search for query in PDF content
-def search_pdf_content(pdf_text_data, query):
-    results = []
-    for entry in pdf_text_data:
-        if query.lower() in entry["content"].lower():
-            results.append(entry)
-    return results
-
-# Function to split text into chunks
-def split_text_into_chunks(text, chunk_size=500):
-    words = text.split()
-    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
-    return chunks
-
-# Function to compute cosine similarity between query and document chunks
-def compute_cosine_similarity(query, chunks):
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
-    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
-    return cosine_similarities
-
-# Function to find KNN-based similar documents
-def find_knn_similar_documents(query, chunks, k=5):
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
-    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
-    knn.fit(tfidf_matrix[1:])
-    distances, indices = knn.kneighbors(tfidf_matrix[0:1])
-    return indices.flatten(), distances.flatten()
+        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
+        if response.status_code == 200:
+            return response.json()[0]["generated_text"]
+        else:
+            st.error(f"API Error: {response.status_code}")
+            return None
+    except Exception as e:
+        st.error(f"Connection Error: {str(e)}")
+        return None

-# Default system prompts for each query translation method
-DEFAULT_SYSTEM_PROMPTS = {
-    "Multi-Query": "...",
-    "RAG Fusion": "...",
-    "Decomposition": "...",
-    "Step Back": "...",
-    "HyDE": "...",
-}
+# Enhanced indexing strategies
+def create_index(text_chunks, method="Multi-Representation"):
+    if method == "Multi-Representation":
+        return TfidfVectorizer().fit_transform(text_chunks)
+    elif method == "Raptors":
+        embeddings = sentence_model.encode(text_chunks)
+        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index.add(embeddings)
+        return index
+    elif method == "ColBERT":
+        return sentence_model.encode(text_chunks)
+
+# Improved similarity search with multiple methods
+def similarity_search(query, chunks, method="Cosine", index=None, k=5):
+    if method == "Cosine":
+        return semantic_search(query, chunks)
+    elif method == "KNN":
+        if isinstance(index, faiss.IndexFlatL2):
+            query_embedding = sentence_model.encode([query])
+            distances, indices = index.search(query_embedding, k)
+            return [(chunks[i], 1 - distances[0][j]) for j, i in enumerate(indices[0])]
+        return []

 # Streamlit App
 def main():
-    st.title("RAG Model with Advanced Features")
-
-    # Sidebar Options
-    st.sidebar.title("Settings")
-    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
-
-    # Query Translation Options
-    st.sidebar.header("Query Translation")
-    query_translation = st.sidebar.selectbox(
-        "Select Query Translation Method",
-        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
-    )
-
-    # Indexing Options
-    st.sidebar.header("Indexing")
-    indexing_method = st.sidebar.selectbox(
-        "Select Indexing Method",
-        ["Multi-Representation", "Raptors", "ColBERT"]
-    )
-
-    # Similarity Search Options
-    st.sidebar.header("Similarity Search")
-    similarity_method = st.sidebar.selectbox(
-        "Select Similarity Search Method",
-        ["Cosine Similarity", "KNN"]
-    )
-    if similarity_method == "KNN":
-        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)
-
-    # LLM Parameters
-    st.sidebar.header("LLM Parameters")
-    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
-    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
-    top_k = st.sidebar.slider("Top K", 1, 100, 50)
-
-    # System Prompt
-    st.sidebar.header("System Prompt")
-    default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
-    system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
-
-    # Main Content
-    st.header("Input Prompt")
-    prompt = st.text_input("Enter your prompt:")
+    st.title("Enhanced RAG Model with Advanced Features")
+
+    # Sidebar configurations
+    st.sidebar.title("Configuration")
+    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
+    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
+    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
+    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
+    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)
+
+    # Main interface
+    prompt = st.text_input("Enter your query:")
+
     if prompt:
-        if st.button("Apply Indexing"):
-            st.write(f"**Applied Indexing Method:** {indexing_method}")
-            if pdf_file is not None:
-                # Extract and search PDF content
-                pdf_text_data = extract_text_from_pdf(pdf_file)
-                search_results = search_pdf_content(pdf_text_data, prompt)
-
-                if search_results:
-                    st.write("**Relevant Content from PDF:**")
-                    for result in search_results:
-                        st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
-
-                    # Split text into chunks
-                    chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
-                    st.write("**Chunks Obtained from PDF:**")
-                    for i, chunk in enumerate(chunks):
-                        st.write(f"**Chunk {i + 1}:** {chunk}")
-
-                    # Perform similarity search
-                    if similarity_method == "Cosine Similarity":
-                        st.write("**Cosine Similarity Results:**")
-                        cosine_similarities = compute_cosine_similarity(prompt, chunks)
-                        for i, similarity in enumerate(cosine_similarities):
-                            st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
-                    elif similarity_method == "KNN":
-                        st.write(f"**KNN Results (k={k_value}):**")
-                        indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
-                        for i, (index, distance) in enumerate(zip(indices, distances)):
-                            st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
+        with st.spinner("Processing..."):
+            # Query Translation
+            translated_prompt = query_huggingface_model(
+                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
+            )
+
+            if pdf_file:
+                # Process PDF
+                text_data = extract_text_from_pdf(pdf_file)
+                full_text = " ".join([p["content"] for p in text_data])
+                chunks = split_text_into_chunks(full_text)
+
+                # Create index
+                index = create_index(chunks, indexing_method)
+
+                # Perform search
+                if query_translation == "HyDE":
+                    hypothetical_answer = translated_prompt
+                    results = semantic_search(hypothetical_answer, chunks, similarity_threshold)
                 else:
-                # Generate response based on PDF content
-                pdf_context = "\n".join([result["content"] for result in search_results])
-                response = query_huggingface_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
-                if response:
-                    st.write("**Response:**", response)
+                    results = similarity_search(prompt, chunks, similarity_method, index)
+
+                # Display results
+                if results:
+                    st.subheader("Top Results:")
+                    for i, (chunk, score) in enumerate(results[:3]):
+                        st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
+                        st.write(chunk)
+
+                    # Generate response
+                    context = "\n".join([chunk for chunk, _ in results[:3]])
+                    response = query_huggingface_model(
+                        f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
+                    )
+                    st.subheader("Generated Response:")
+                    st.write(response)
                 else:
+                    st.warning("No relevant documents found matching the query.")
             else:
+                st.error("Please upload a PDF document first.")

 if __name__ == "__main__":
     main()
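
A few notes on this change. First, the top of the `payload` dictionary sits between the two hunks (it is unchanged), so the diff does not display it. For reference, the documented request shape for the Hugging Face Inference API's text-generation endpoint is below; the visible fields (`"top_k": top_k,` and the closing braces) are consistent with it. This is the standard API shape, not a line-for-line recovery of the hidden code.

    # Standard Hugging Face Inference API text-generation payload shape
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }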
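
Second, the new `main()` still reads `DEFAULT_SYSTEM_PROMPTS` (for the Query Translation selectbox and for formatting the translated prompt), but the diff removes the old definition without adding a replacement, so the updated file would raise a `NameError` at startup. A minimal sketch of the dictionary the code expects is below: the five keys come from the old selectbox options, and each value must contain a `{question}` placeholder because `main()` calls `.format(question=prompt)` on it. The prompt wording here is illustrative, not the removed text.

    # Minimal sketch; the keys are required by main(), the wording is illustrative.
    DEFAULT_SYSTEM_PROMPTS = {
        "Multi-Query": "Rewrite the following question as several alternative queries: {question}",
        "RAG Fusion": "Generate multiple search queries for: {question}",
        "Decomposition": "Break this question into simpler sub-questions: {question}",
        "Step Back": "Ask a more general version of this question: {question}",
        "HyDE": "Write a short hypothetical passage that answers: {question}",
    }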
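
Third, a note on the KNN branch of `similarity_search`: `faiss.IndexFlatL2` returns squared L2 distances, so `1 - distance` is not a cosine similarity and can go well below zero for distant chunks. If cosine-style scores are wanted, one common pattern is an inner-product index over L2-normalized embeddings, sketched below under that assumption (`faiss.normalize_L2` and `faiss.IndexFlatIP` are standard faiss calls).

    import numpy as np
    import faiss

    def build_cosine_index(embeddings):
        # Normalize in place; with unit vectors, inner product equals cosine.
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        return index

    def cosine_knn(index, query_embedding, k=5):
        q = np.ascontiguousarray(query_embedding, dtype="float32")
        faiss.normalize_L2(q)
        scores, indices = index.search(q, k)  # scores are true cosine similarities
        return list(zip(indices[0], scores[0]))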
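
Relatedly, `create_index` returns three different types (a sparse TF-IDF matrix, a faiss index, or a raw embedding matrix), but the KNN branch of `similarity_search` only handles the faiss case and silently returns `[]` for the other two. A sketch of covering the plain-embedding case (the "ColBERT" return value) is below; it assumes the module-level `sentence_model` defined in the file above.

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def knn_over_embeddings(query, chunks, chunk_embeddings, k=5):
        # Fallback KNN for when the "index" is a plain embedding matrix.
        query_embedding = sentence_model.encode([query])
        sims = cosine_similarity(query_embedding, chunk_embeddings)[0]
        top = np.argsort(sims)[::-1][:k]
        return [(chunks[i], float(sims[i])) for i in top]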
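
Fourth, `query_huggingface_model` returns `None` on any API failure, and the HyDE path passes its output straight to `semantic_search`, where `sentence_model.encode([None])` would fail. A small guard, falling back to the raw prompt, would make that path robust:

    # Sketch: fall back to the raw prompt if query translation failed.
    search_text = translated_prompt if translated_prompt else prompt
    results = semantic_search(search_text, chunks, similarity_threshold)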
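
Fifth, `import hashlib` is added but never used. If the intent was a stable, content-based cache key for uploaded PDFs (so re-uploads of identical bytes hit the same cache entry), a sketch is below; it assumes Streamlit's `UploadedFile`, whose `getvalue()` returns the file's raw bytes.

    import hashlib

    def pdf_cache_key(pdf_file):
        # Hash the raw bytes so the key depends only on file content.
        return hashlib.sha256(pdf_file.getvalue()).hexdigest()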
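
Finally, a worked example of the new overlapping chunker: the loop advances by `chunk_size - overlap` words, so with the defaults (500/100) consecutive chunks share 100 words. Scaled down to `chunk_size=5, overlap=2` for illustration:

    # Same stride logic as split_text_into_chunks, on a ten-word toy text.
    words = "a b c d e f g h i j".split()
    chunk_size, overlap = 5, 2
    chunks = [" ".join(words[i:i + chunk_size])
              for i in range(0, len(words), chunk_size - overlap)]
    print(chunks)
    # ['a b c d e', 'd e f g h', 'g h i j', 'j']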