bainskarman committed on
Commit
5f45885
·
verified ·
1 Parent(s): 15db503

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -66
app.py CHANGED
@@ -1,25 +1,21 @@
1
  import streamlit as st
2
  import os
 
3
  from langdetect import detect
4
  from PyPDF2 import PdfReader
5
- import requests
6
- from sentence_transformers import SentenceTransformer
7
- import faiss
8
  import numpy as np
9
 
10
- # Load the API key from Streamlit secrets
11
- API_KEY = st.secrets["Key2"]
12
- API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
13
 
14
- # Load the embedding model for semantic search
15
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
16
-
17
- # Function to query the LLM via Hugging Face Inference API
18
- def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
19
- headers = {
20
- "Authorization": f"Bearer {API_KEY}",
21
- "Content-Type": "application/json",
22
- }
23
  payload = {
24
  "inputs": prompt,
25
  "parameters": {
@@ -28,18 +24,18 @@ def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
28
  "top_k": top_k,
29
  },
30
  }
31
- response = requests.post(API_URL, headers=headers, json=payload)
32
  if response.status_code == 200:
33
- return response.json()["generated_text"]
34
  else:
35
- st.error(f"Error querying the API: {response.status_code}, {response.text}")
36
  return None
37
 
38
  # Function to detect language
39
  def detect_language(text):
40
  try:
41
  return detect(text)
42
- except Exception:
43
  return "en" # Default to English if detection fails
44
 
45
  # Function to extract text from PDF with line and page numbers
@@ -47,52 +43,191 @@ def extract_text_from_pdf(pdf_file):
47
  pdf_reader = PdfReader(pdf_file)
48
  text_data = []
49
  for page_num, page in enumerate(pdf_reader.pages):
50
- if page.extract_text():
51
- lines = page.extract_text().split('\n')
52
- for line_num, line in enumerate(lines):
53
- text_data.append({
54
- "page": page_num + 1,
55
- "line": line_num + 1,
56
- "content": line
57
- })
58
  return text_data
59
 
60
- # Function to create embeddings for the PDF text
61
- def get_embeddings(text_data):
62
- texts = [entry['content'] for entry in text_data]
63
- return embedding_model.encode(texts, convert_to_tensor=False)
64
-
65
- # Function to perform KNN or cosine similarity search
66
- def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
67
- query_embedding = embedding_model.encode([query])[0]
68
- pdf_embeddings = get_embeddings(pdf_text_data)
69
-
70
- if search_type == "knn":
71
- index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
72
- index.add(pdf_embeddings.astype('float32'))
73
- distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
74
- return [pdf_text_data[i] for i in indices[0]]
75
-
76
- elif search_type == "cosine":
77
- pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
78
- query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
79
- similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
80
- top_indices = np.argsort(similarities)[-k:][::-1]
81
- return [pdf_text_data[i] for i in top_indices]
82
-
83
- # Streamlit UI
84
- st.title("PDF Search with LLM and Semantic Search")
85
-
86
- pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
87
- search_query = st.text_input("Enter your search query")
88
-
89
- search_method = st.radio("Select Search Method", ("knn", "cosine"))
90
- k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)
91
-
92
- if pdf_file and search_query:
93
- pdf_text_data = extract_text_from_pdf(pdf_file)
94
- results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
95
-
96
- st.write("### Search Results")
97
- for res in results:
98
- st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
+ import requests
4
  from langdetect import detect
5
  from PyPDF2 import PdfReader
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from sklearn.neighbors import NearestNeighbors
9
  import numpy as np
10
 
11
+ # Load the Hugging Face token from environment variables
12
+ huggingface_token = os.environ.get("HUGGINGFACE_TOKEN") # Replace with your Hugging Face token
 
13
 
14
+ # Function to query the Hugging Face API
15
+ def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
16
+ model_name = "HuggingFaceH4/zephyr-7b-alpha" # Replace with your preferred model
17
+ api_url = f"https://api-inference.huggingface.co/models/{model_name}"
18
+ headers = {"Authorization": f"Bearer {huggingface_token}"}
 
 
 
 
19
  payload = {
20
  "inputs": prompt,
21
  "parameters": {
 
24
  "top_k": top_k,
25
  },
26
  }
27
+ response = requests.post(api_url, headers=headers, json=payload)
28
  if response.status_code == 200:
29
+ return response.json()[0]["generated_text"]
30
  else:
31
+ st.error(f"Error: {response.status_code} - {response.text}")
32
  return None
33
 
34
  # Function to detect language
35
  def detect_language(text):
36
  try:
37
  return detect(text)
38
+ except:
39
  return "en" # Default to English if detection fails
40
 
41
  # Function to extract text from PDF with line and page numbers
 
43
  pdf_reader = PdfReader(pdf_file)
44
  text_data = []
45
  for page_num, page in enumerate(pdf_reader.pages):
46
+ lines = page.extract_text().split('\n')
47
+ for line_num, line in enumerate(lines):
48
+ text_data.append({
49
+ "page": page_num + 1,
50
+ "line": line_num + 1,
51
+ "content": line
52
+ })
 
53
  return text_data
54
 
55
+ # Function to search for query in PDF content
56
+ def search_pdf_content(pdf_text_data, query):
57
+ results = []
58
+ for entry in pdf_text_data:
59
+ if query.lower() in entry["content"].lower():
60
+ results.append(entry)
61
+ return results
62
+
63
+ # Function to split text into chunks
64
+ def split_text_into_chunks(text, chunk_size=500):
65
+ words = text.split()
66
+ chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
67
+ return chunks
68
+
69
+ # Function to compute cosine similarity between query and document chunks
70
+ def compute_cosine_similarity(query, chunks):
71
+ vectorizer = TfidfVectorizer()
72
+ tfidf_matrix = vectorizer.fit_transform([query] + chunks)
73
+ cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
74
+ return cosine_similarities
75
+
76
+ # Function to find KNN-based similar documents
77
+ def find_knn_similar_documents(query, chunks, k=5):
78
+ vectorizer = TfidfVectorizer()
79
+ tfidf_matrix = vectorizer.fit_transform([query] + chunks)
80
+ knn = NearestNeighbors(n_neighbors=k, metric="cosine")
81
+ knn.fit(tfidf_matrix[1:])
82
+ distances, indices = knn.kneighbors(tfidf_matrix[0:1])
83
+ return indices.flatten(), distances.flatten()
84
+
85
+ # Default system prompts for each query translation method
86
+ DEFAULT_SYSTEM_PROMPTS = {
87
+ "Multi-Query": """You are an AI language model assistant. Your task is to generate five
88
+ different versions of the given user question to retrieve relevant documents from a vector
89
+ database. By generating multiple perspectives on the user question, your goal is to help
90
+ the user overcome some of the limitations of the distance-based similarity search.
91
+ Provide these alternative questions separated by newlines. Original question: {question}""",
92
+ "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
93
+ queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
94
+ "Decomposition": """You are an AI language model assistant. Your task is to break down
95
+ the given user question into simpler sub-questions. Provide these sub-questions separated
96
+ by newlines. Original question: {question}""",
97
+ "Step Back": """You are an AI language model assistant. Your task is to refine the given
98
+ user question by taking a step back and asking a more general question. Original question: {question}""",
99
+ "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
100
+ document that would be relevant to the given user question. Original question: {question}""",
101
+ }
102
+
103
+ # Streamlit App
104
+ def main():
105
+ st.title("RAG Model with Advanced Query Translation and Indexing")
106
+ st.write("Enter a prompt and get a response from the model.")
107
+
108
+ # Sidebar for options
109
+ st.sidebar.title("Options")
110
+
111
+ # PDF Upload
112
+ st.sidebar.header("Upload PDF")
113
+ pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
114
+
115
+ # Query Translation Options
116
+ st.sidebar.header("Query Translation")
117
+ query_translation = st.sidebar.selectbox(
118
+ "Select Query Translation Method",
119
+ ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
120
+ )
121
+
122
+ # Indexing Options
123
+ st.sidebar.header("Indexing")
124
+ indexing_method = st.sidebar.selectbox(
125
+ "Select Indexing Method",
126
+ ["Multi-Representation", "Raptors", "ColBERT"]
127
+ )
128
+
129
+ # Similarity Search Options
130
+ st.sidebar.header("Similarity Search")
131
+ similarity_method = st.sidebar.selectbox(
132
+ "Select Similarity Search Method",
133
+ ["Cosine Similarity", "KNN"]
134
+ )
135
+ if similarity_method == "KNN":
136
+ k_value = st.sidebar.slider("Select K Value", 1, 10, 5)
137
+
138
+ # LLM Parameters
139
+ st.sidebar.header("LLM Parameters")
140
+ max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
141
+ temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
142
+ top_k = st.sidebar.slider("Top K", 1, 100, 50)
143
+
144
+ # System Prompt
145
+ st.sidebar.header("System Prompt")
146
+ default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
147
+ system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
148
+
149
+ # Main Content
150
+ st.header("Input Prompt")
151
+ prompt = st.text_input("Enter your prompt:")
152
+ if prompt:
153
+ st.write("**Prompt:**", prompt)
154
+
155
+ # Detect Language
156
+ language = detect_language(prompt)
157
+ st.write(f"**Detected Language:** {language}")
158
+
159
+ # Query Translation
160
+ if st.button("Apply Query Translation"):
161
+ st.write(f"**Applied Query Translation Method:** {query_translation}")
162
+ # Format the system prompt with the user's question
163
+ formatted_prompt = system_prompt.format(question=prompt)
164
+ st.write("**Formatted System Prompt:**", formatted_prompt)
165
+
166
+ # Query the Hugging Face model for query translation
167
+ translated_queries = query_huggingface_model(formatted_prompt, max_new_tokens, temperature, top_k)
168
+ if translated_queries:
169
+ st.write("**Translated Queries:**")
170
+ st.write(translated_queries.split("\n")[-1]) # Print only the updated question part
171
+
172
+ # Indexing
173
+ if st.button("Apply Indexing"):
174
+ st.write(f"**Applied Indexing Method:** {indexing_method}")
175
+ if pdf_file is not None:
176
+ # Extract and search PDF content
177
+ pdf_text_data = extract_text_from_pdf(pdf_file)
178
+ search_results = search_pdf_content(pdf_text_data, prompt)
179
+
180
+ if search_results:
181
+ st.write("**Relevant Content from PDF:**")
182
+ for result in search_results:
183
+ st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
184
+
185
+ # Split text into chunks
186
+ chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
187
+ st.write("**Chunks Obtained from PDF:**")
188
+ for i, chunk in enumerate(chunks):
189
+ st.write(f"**Chunk {i + 1}:** {chunk}")
190
+
191
+ # Perform similarity search
192
+ if similarity_method == "Cosine Similarity":
193
+ st.write("**Cosine Similarity Results:**")
194
+ cosine_similarities = compute_cosine_similarity(prompt, chunks)
195
+ for i, similarity in enumerate(cosine_similarities):
196
+ st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
197
+ elif similarity_method == "KNN":
198
+ st.write(f"**KNN Results (k={k_value}):**")
199
+ indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
200
+ for i, (index, distance) in enumerate(zip(indices, distances)):
201
+ st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
202
+ else:
203
+ st.write("**No relevant content found in the PDF.**")
204
+ else:
205
+ st.write("**No PDF uploaded.**")
206
+
207
+ # Generate Response
208
+ if st.button("Generate Response"):
209
+ if pdf_file is not None:
210
+ # Extract and search PDF content
211
+ pdf_text_data = extract_text_from_pdf(pdf_file)
212
+ search_results = search_pdf_content(pdf_text_data, prompt)
213
+
214
+ if search_results:
215
+ st.write("**Relevant Content from PDF:**")
216
+ for result in search_results:
217
+ st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
218
+
219
+ # Generate response based on PDF content
220
+ pdf_context = "\n".join([result["content"] for result in search_results])
221
+ response = query_huggingface_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
222
+ else:
223
+ st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
224
+ response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
225
+ else:
226
+ st.write("**No PDF uploaded. Generating response without PDF context.**")
227
+ response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
228
+
229
+ if response:
230
+ st.write("**Response:**", response)
231
+
232
+ if __name__ == "__main__":
233
+ main()