bainskarman committed on
Commit
4f13fd4
·
verified ·
1 Parent(s): 5c9c8d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -194
app.py CHANGED
@@ -1,19 +1,56 @@
1
  import streamlit as st
2
  import os
3
  import requests
 
4
  from langdetect import detect
5
  from PyPDF2 import PdfReader
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  from sklearn.neighbors import NearestNeighbors
9
  import numpy as np
 
 
 
10
 
11
  # Load the Hugging Face token from environment variables
12
- huggingface_token = os.environ.get("Key2") # Replace with your Hugging Face token
13
 
14
- # Function to query the Hugging Face API
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
16
- model_name = "HuggingFaceH4/zephyr-7b-alpha" # Replace with your preferred model
17
  api_url = f"https://api-inference.huggingface.co/models/{model_name}"
18
  headers = {"Authorization": f"Bearer {huggingface_token}"}
19
  payload = {
@@ -24,206 +61,96 @@ def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=
24
  "top_k": top_k,
25
  },
26
  }
27
- response = requests.post(api_url, headers=headers, json=payload)
28
- if response.status_code == 200:
29
- return response.json()[0]["generated_text"]
30
- else:
31
- st.error(f"Error: {response.status_code} - {response.text}")
32
- return None
33
-
34
- # Function to detect language
35
- def detect_language(text):
36
  try:
37
- return detect(text)
38
- except:
39
- return "en" # Default to English if detection fails
40
-
41
- # Function to extract text from PDF with line and page numbers
42
- def extract_text_from_pdf(pdf_file):
43
- pdf_reader = PdfReader(pdf_file)
44
- text_data = []
45
- for page_num, page in enumerate(pdf_reader.pages):
46
- lines = page.extract_text().split('\n')
47
- for line_num, line in enumerate(lines):
48
- text_data.append({
49
- "page": page_num + 1,
50
- "line": line_num + 1,
51
- "content": line
52
- })
53
- return text_data
54
-
55
- # Function to search for query in PDF content
56
- def search_pdf_content(pdf_text_data, query):
57
- results = []
58
- for entry in pdf_text_data:
59
- if query.lower() in entry["content"].lower():
60
- results.append(entry)
61
- return results
62
-
63
- # Function to split text into chunks
64
- def split_text_into_chunks(text, chunk_size=500):
65
- words = text.split()
66
- chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
67
- return chunks
68
-
69
- # Function to compute cosine similarity between query and document chunks
70
- def compute_cosine_similarity(query, chunks):
71
- vectorizer = TfidfVectorizer()
72
- tfidf_matrix = vectorizer.fit_transform([query] + chunks)
73
- cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
74
- return cosine_similarities
75
-
76
- # Function to find KNN-based similar documents
77
- def find_knn_similar_documents(query, chunks, k=5):
78
- vectorizer = TfidfVectorizer()
79
- tfidf_matrix = vectorizer.fit_transform([query] + chunks)
80
- knn = NearestNeighbors(n_neighbors=k, metric="cosine")
81
- knn.fit(tfidf_matrix[1:])
82
- distances, indices = knn.kneighbors(tfidf_matrix[0:1])
83
- return indices.flatten(), distances.flatten()
84
 
85
- # Default system prompts for each query translation method
86
- DEFAULT_SYSTEM_PROMPTS = {
87
- "Multi-Query": """You are an AI language model assistant. Your task is to generate five
88
- different versions of the given user question to retrieve relevant documents from a vector
89
- database. By generating multiple perspectives on the user question, your goal is to help
90
- the user overcome some of the limitations of the distance-based similarity search.
91
- Provide these alternative questions separated by newlines. Original question: {question}""",
92
- "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
93
- queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
94
- "Decomposition": """You are an AI language model assistant. Your task is to break down
95
- the given user question into simpler sub-questions. Provide these sub-questions separated
96
- by newlines. Original question: {question}""",
97
- "Step Back": """You are an AI language model assistant. Your task is to refine the given
98
- user question by taking a step back and asking a more general question. Original question: {question}""",
99
- "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
100
- document that would be relevant to the given user question. Original question: {question}""",
101
- }
 
 
 
 
 
102
 
103
  # Streamlit App
104
  def main():
105
- st.title("RAG Model with Advanced Query Translation and Indexing")
106
- st.write("Enter a prompt and get a response from the model.")
107
-
108
- # Sidebar for options
109
- st.sidebar.title("Options")
110
-
111
- # PDF Upload
112
- st.sidebar.header("Upload PDF")
113
- pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
114
-
115
- # Query Translation Options
116
- st.sidebar.header("Query Translation")
117
- query_translation = st.sidebar.selectbox(
118
- "Select Query Translation Method",
119
- ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
120
- )
121
-
122
- # Indexing Options
123
- st.sidebar.header("Indexing")
124
- indexing_method = st.sidebar.selectbox(
125
- "Select Indexing Method",
126
- ["Multi-Representation", "Raptors", "ColBERT"]
127
- )
128
-
129
- # Similarity Search Options
130
- st.sidebar.header("Similarity Search")
131
- similarity_method = st.sidebar.selectbox(
132
- "Select Similarity Search Method",
133
- ["Cosine Similarity", "KNN"]
134
- )
135
- if similarity_method == "KNN":
136
- k_value = st.sidebar.slider("Select K Value", 1, 10, 5)
137
-
138
- # LLM Parameters
139
- st.sidebar.header("LLM Parameters")
140
- max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
141
- temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
142
- top_k = st.sidebar.slider("Top K", 1, 100, 50)
143
-
144
- # System Prompt
145
- st.sidebar.header("System Prompt")
146
- default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
147
- system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
148
-
149
- # Main Content
150
- st.header("Input Prompt")
151
- prompt = st.text_input("Enter your prompt:")
152
  if prompt:
153
- st.write("**Prompt:**", prompt)
154
-
155
- # Detect Language
156
- language = detect_language(prompt)
157
- st.write(f"**Detected Language:** {language}")
158
-
159
- # Query Translation
160
- if st.button("Apply Query Translation"):
161
- # Format the system prompt with the user's question
162
- formatted_prompt = system_prompt.format(question=prompt)
163
- st.write("**Formatted System Prompt:**", formatted_prompt)
164
-
165
- # Query the Hugging Face model for query translation
166
- translated_queries = query_huggingface_model(formatted_prompt, max_new_tokens, temperature, top_k)
167
- if translated_queries:
168
- st.write("**Translated Queries:**")
169
- st.write(translated_queries.split("\n")[-1]) # Print only the updated question part
170
-
171
- # Indexing
172
- if st.button("Apply Indexing"):
173
- st.write(f"**Applied Indexing Method:** {indexing_method}")
174
- if pdf_file is not None:
175
- # Extract and search PDF content
176
- pdf_text_data = extract_text_from_pdf(pdf_file)
177
- search_results = search_pdf_content(pdf_text_data, prompt)
178
-
179
- if search_results:
180
- st.write("**Relevant Content from PDF:**")
181
- for result in search_results:
182
- st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
183
-
184
- # Split text into chunks
185
- chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
186
- st.write("**Chunks Obtained from PDF:**")
187
- for i, chunk in enumerate(chunks):
188
- st.write(f"**Chunk {i + 1}:** {chunk}")
189
-
190
- # Perform similarity search
191
- if similarity_method == "Cosine Similarity":
192
- st.write("**Cosine Similarity Results:**")
193
- cosine_similarities = compute_cosine_similarity(prompt, chunks)
194
- for i, similarity in enumerate(cosine_similarities):
195
- st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
196
- elif similarity_method == "KNN":
197
- st.write(f"**KNN Results (k={k_value}):**")
198
- indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
199
- for i, (index, distance) in enumerate(zip(indices, distances)):
200
- st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
201
  else:
202
- st.write("**No relevant content found in the PDF.**")
203
- else:
204
- st.write("**No PDF uploaded.**")
205
-
206
- # Generate Response
207
- if st.button("Generate Response"):
208
- if pdf_file is not None:
209
- # Extract and search PDF content
210
- pdf_text_data = extract_text_from_pdf(pdf_file)
211
- search_results = search_pdf_content(pdf_text_data, prompt)
212
-
213
- if search_results:
214
- st.write("**Relevant Content from PDF:**")
215
- for result in search_results:
216
- st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
217
-
218
- # Generate response based on PDF content
219
- pdf_context = "\n".join([result["content"] for result in search_results])
220
- response = query_huggingface_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
221
- if response:
222
- st.write("**Response:**", response)
223
  else:
224
- st.write("**No relevant content found in the PDF. No response generated.**")
225
  else:
226
- st.write("**No PDF uploaded. No response generated.**")
227
 
228
  if __name__ == "__main__":
229
  main()
 
1
  import streamlit as st
2
  import os
3
  import requests
4
+ import re
5
  from langdetect import detect
6
  from PyPDF2 import PdfReader
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from sklearn.neighbors import NearestNeighbors
10
  import numpy as np
11
+ from sentence_transformers import SentenceTransformer
12
+ import faiss
13
+ import hashlib
14
 
15
  # Load the Hugging Face token from environment variables
16
+ huggingface_token = os.environ.get("Key2")
17
 
18
# Initialize Sentence Transformer model for better embeddings.
# Streamlit re-executes this script on every user interaction, so the model is
# created through st.cache_resource and reused instead of being re-instantiated
# on each rerun.
@st.cache_resource
def _load_sentence_model():
    """Return the shared 'all-MiniLM-L6-v2' SentenceTransformer instance."""
    return SentenceTransformer('all-MiniLM-L6-v2')


sentence_model = _load_sentence_model()
20
+
21
+ # Cache PDF extraction
22
@st.cache_data
def extract_text_from_pdf(pdf_file):
    """Extract whitespace-normalised text from every page of a PDF.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (here, the Streamlit upload).

    Returns:
        A list with one dict per page: ``{"page": <1-based page number>,
        "content": <page text with whitespace runs collapsed to one space>}``.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # NOTE(review): pages without a text layer may yield an empty or
        # missing result depending on the PyPDF2 version; normalise to ""
        # so re.sub never receives a non-string.
        raw_text = page.extract_text() or ""
        text_data.append({
            "page": page_number,
            "content": re.sub(r'\s+', ' ', raw_text),  # Clean extra whitespace
        })
    return text_data
34
+
35
+ # Enhanced text chunking with overlap
36
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split text into overlapping word-based chunks.

    Args:
        text: The text to split; it is tokenised on whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings, each made of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero would make range() fail and a negative step would
        # silently produce no chunks at all, so reject both up front.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # be fully contained in this one, so stop instead of emitting
        # redundant tail chunks.
        if start + chunk_size >= len(words):
            break
    return chunks
42
+
43
+ # Enhanced semantic search using sentence transformers
44
def semantic_search(query, chunks, threshold=0.3):
    """Rank chunks by embedding cosine similarity to the query.

    Args:
        query: The search text. An empty or ``None`` query (for example a
            failed upstream generation) yields no results.
        chunks: List of text chunks to rank.
        threshold: Only chunks whose similarity is strictly greater than this
            value are returned.

    Returns:
        A list of ``(chunk, score)`` tuples sorted by descending score.
    """
    # Nothing to encode: avoid handing an empty batch or a None query to the
    # encoder / cosine_similarity.
    if not query or not chunks:
        return []

    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]

    best_first = np.argsort(similarities)[::-1]
    return [
        (chunks[i], float(similarities[i]))
        for i in best_first
        if similarities[i] > threshold
    ]
50
+
51
+ # Improved query translation with error handling
52
  def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
53
+ model_name = "HuggingFaceH4/zephyr-7b-alpha"
54
  api_url = f"https://api-inference.huggingface.co/models/{model_name}"
55
  headers = {"Authorization": f"Bearer {huggingface_token}"}
56
  payload = {
 
61
  "top_k": top_k,
62
  },
63
  }
 
 
 
 
 
 
 
 
 
64
  try:
65
+ response = requests.post(api_url, headers=headers, json=payload, timeout=30)
66
+ if response.status_code == 200:
67
+ return response.json()[0]["generated_text"]
68
+ else:
69
+ st.error(f"API Error: {response.status_code}")
70
+ return None
71
+ except Exception as e:
72
+ st.error(f"Connection Error: {str(e)}")
73
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ # Enhanced indexing strategies
76
def create_index(text_chunks, method="Multi-Representation"):
    """Build a search index over the chunks using the selected strategy.

    Args:
        text_chunks: List of chunk strings to index.
        method: One of ``"Multi-Representation"``, ``"Raptors"`` or
            ``"ColBERT"``.

    Returns:
        * ``"Multi-Representation"``: the TF-IDF matrix of the chunks.
        * ``"Raptors"``: a ``faiss.IndexFlatL2`` holding the chunk embeddings.
        * ``"ColBERT"``: the array of chunk embeddings.
        * ``None`` when there are no chunks or the method is not recognised.
    """
    # None of the strategies can be built from an empty corpus.
    if not text_chunks:
        return None

    if method == "Multi-Representation":
        return TfidfVectorizer().fit_transform(text_chunks)

    if method == "Raptors":
        # FAISS expects a contiguous float32 matrix; convert defensively.
        embeddings = np.ascontiguousarray(
            sentence_model.encode(text_chunks), dtype=np.float32
        )
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index

    if method == "ColBERT":
        return sentence_model.encode(text_chunks)

    return None
86
+
87
+ # Improved similarity search with multiple methods
88
def similarity_search(query, chunks, method="Cosine", index=None, k=5):
    """Search the chunks with the selected similarity method.

    Args:
        query: The search text.
        chunks: List of chunk strings, in the same order used to build ``index``.
        method: ``"Cosine"`` for embedding cosine similarity, or ``"KNN"`` for
            nearest-neighbour lookup in a FAISS index.
        index: For ``"KNN"``, a ``faiss.IndexFlatL2`` built over ``chunks``.
        k: Maximum number of neighbours to return for ``"KNN"``.

    Returns:
        A list of ``(chunk, score)`` tuples. For ``"KNN"`` the score is
        ``1 - distance`` as reported by the index. An empty list is returned
        for an unknown method, when there are no chunks, or when ``"KNN"`` is
        requested without a FAISS L2 index.
    """
    if not chunks:
        return []

    if method == "Cosine":
        return semantic_search(query, chunks)

    if method != "KNN":
        return []
    if index is None or not isinstance(index, faiss.IndexFlatL2):
        return []

    # Asking FAISS for more neighbours than it holds makes it pad the result
    # with label -1, which would silently index the *last* chunk.
    k = min(k, index.ntotal, len(chunks))
    if k <= 0:
        return []

    query_embedding = sentence_model.encode([query])
    distances, indices = index.search(query_embedding, k)
    return [
        (chunks[chunk_idx], float(1 - distances[0][rank]))
        for rank, chunk_idx in enumerate(indices[0])
        if 0 <= chunk_idx < len(chunks)
    ]
97
 
98
  # Streamlit App
99
# Prompt templates for each query translation method. main() looks these up by
# name and fills in ``{question}``, so they must be defined at module level.
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}


def main():
    """Run the Streamlit UI: configure, retrieve PDF chunks, generate an answer.

    Flow: read the sidebar options and the query, extract and chunk the
    uploaded PDF, build the selected index, retrieve the best-matching chunks,
    show the top three and ask the LLM to answer using them as context.
    """
    st.title("Enhanced RAG Model with Advanced Features")

    # Sidebar configurations
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)

    # Main interface
    prompt = st.text_input("Enter your query:")
    if not prompt:
        return

    with st.spinner("Processing..."):
        if not pdf_file:
            st.error("Please upload a PDF document first.")
            return

        # Process PDF
        text_data = extract_text_from_pdf(pdf_file)
        full_text = " ".join(page["content"] for page in text_data)
        chunks = split_text_into_chunks(full_text)
        if not chunks:
            st.warning("No extractable text was found in the uploaded PDF.")
            return

        # Create index
        index = create_index(chunks, indexing_method)

        # Perform search
        if query_translation == "HyDE":
            # Only HyDE consumes the translated text, so the translation call
            # is made here rather than unconditionally.
            hypothetical_answer = query_huggingface_model(
                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
            )
            # Fall back to the raw query if the model call failed (None).
            results = semantic_search(hypothetical_answer or prompt, chunks, similarity_threshold)
        elif similarity_method == "Cosine":
            # Call semantic_search directly so the sidebar threshold applies.
            results = semantic_search(prompt, chunks, similarity_threshold)
        else:
            if similarity_method == "KNN" and indexing_method != "Raptors":
                # similarity_search only performs KNN on a FAISS index, which
                # create_index builds solely for the "Raptors" method.
                st.warning("KNN search requires the 'Raptors' indexing method.")
            results = similarity_search(prompt, chunks, similarity_method, index)

        # Display results
        if not results:
            st.warning("No relevant documents found matching the query.")
            return

        top_results = results[:3]
        st.subheader("Top Results:")
        for i, (chunk, score) in enumerate(top_results):
            st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
            st.write(chunk)

        # Generate response
        context = "\n".join(chunk for chunk, _ in top_results)
        response = query_huggingface_model(
            f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
        )
        # query_huggingface_model reports failures itself and returns None,
        # so only render the section when there is something to show.
        if response:
            st.subheader("Generated Response:")
            st.write(response)
154
 
155
  if __name__ == "__main__":
156
  main()