bainskarman committed
Commit a0f23a4 · verified · 1 Parent(s): dfccc9d

Update app.py

Files changed (1): app.py +79 -175
app.py CHANGED
@@ -1,33 +1,45 @@
 import streamlit as st
 import os
-import requests
 from langdetect import detect
 from PyPDF2 import PdfReader
-import replicate  # For interacting with Llama models hosted on Replicate
-
-# Load the Replicate API token from environment variables
-replicate_api_token = os.environ.get("Key2")  # Replace with your Replicate API token
-
-# Function to query the Llama 3.2 7B Instruct model via Replicate
-def query_llama_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
-    model_name = "meta/llama-3-7b-instruct"  # Replace with the correct model name on Replicate
-    input_data = {
-        "prompt": prompt,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_k": top_k,
+import requests
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+# Load the API key from Streamlit secrets
+API_KEY = st.secrets["Key2"]
+API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
+
+# Load the embedding model for semantic search
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# Function to query the LLM via Hugging Face Inference API
+def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
     }
-    response = replicate.run(
-        model_name,
-        input=input_data
-    )
-    return "".join(response)  # Replicate returns a generator, so we join it into a single string
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_k": top_k,
+        },
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    if response.status_code == 200:
+        return response.json()["generated_text"]
+    else:
+        st.error(f"Error querying the API: {response.status_code}, {response.text}")
+        return None
 
 # Function to detect language
 def detect_language(text):
     try:
         return detect(text)
-    except:
+    except Exception:
         return "en"  # Default to English if detection fails
 
 # Function to extract text from PDF with line and page numbers
@@ -35,160 +47,52 @@ def extract_text_from_pdf(pdf_file):
     pdf_reader = PdfReader(pdf_file)
     text_data = []
     for page_num, page in enumerate(pdf_reader.pages):
-        lines = page.extract_text().split('\n')
-        for line_num, line in enumerate(lines):
-            text_data.append({
-                "page": page_num + 1,
-                "line": line_num + 1,
-                "content": line
-            })
+        if page.extract_text():
+            lines = page.extract_text().split('\n')
+            for line_num, line in enumerate(lines):
+                text_data.append({
+                    "page": page_num + 1,
+                    "line": line_num + 1,
+                    "content": line
+                })
     return text_data
 
-# Function to search for query in PDF content
-def search_pdf_content(pdf_text_data, query):
-    results = []
-    for entry in pdf_text_data:
-        if query.lower() in entry["content"].lower():
-            results.append(entry)
-    return results
-
-# Function to split text into chunks
-def split_text_into_chunks(text, chunk_size=500):
-    words = text.split()
-    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
-    return chunks
-
-# Default system prompts for each query translation method
-DEFAULT_SYSTEM_PROMPTS = {
-    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
-    different versions of the given user question to retrieve relevant documents from a vector
-    database. By generating multiple perspectives on the user question, your goal is to help
-    the user overcome some of the limitations of the distance-based similarity search.
-    Provide these alternative questions separated by newlines. Original question: {question}""",
-    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
-    queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
-    "Decomposition": """You are an AI language model assistant. Your task is to break down
-    the given user question into simpler sub-questions. Provide these sub-questions separated
-    by newlines. Original question: {question}""",
-    "Step Back": """You are an AI language model assistant. Your task is to refine the given
-    user question by taking a step back and asking a more general question. Original question: {question}""",
-    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
-    document that would be relevant to the given user question. Original question: {question}""",
-}
-
-# Streamlit App
-def main():
-    st.title("RAG Model with Advanced Query Translation and Indexing")
-    st.write("Enter a prompt and get a response from the model.")
-
-    # Sidebar for options
-    st.sidebar.title("Options")
-
-    # PDF Upload
-    st.sidebar.header("Upload PDF")
-    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
-
-    # Query Translation Options
-    st.sidebar.header("Query Translation")
-    query_translation = st.sidebar.selectbox(
-        "Select Query Translation Method",
-        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
-    )
-
-    # Indexing Options
-    st.sidebar.header("Indexing")
-    indexing_method = st.sidebar.selectbox(
-        "Select Indexing Method",
-        ["Multi-Representation", "Raptors", "ColBERT"]
-    )
-
-    # LLM Parameters
-    st.sidebar.header("LLM Parameters")
-    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
-    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
-    top_k = st.sidebar.slider("Top K", 1, 100, 50)
-
-    # System Prompt
-    st.sidebar.header("System Prompt")
-    default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
-    system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
-
-    # Main Content
-    st.header("Input Prompt")
-    prompt = st.text_input("Enter your prompt:")
-    if prompt:
-        st.write("**Prompt:**", prompt)
-
-        # Detect Language
-        language = detect_language(prompt)
-        st.write(f"**Detected Language:** {language}")
-
-        # Query Translation
-        if st.button("Apply Query Translation"):
-            st.write(f"**Applied Query Translation Method:** {query_translation}")
-            # Format the system prompt with the user's question
-            formatted_prompt = system_prompt.format(question=prompt)
-            st.write("**Formatted System Prompt:**", formatted_prompt)
-
-            # Query the Llama model for query translation
-            translated_queries = query_llama_model(formatted_prompt, max_new_tokens, temperature, top_k)
-            if translated_queries:
-                st.write("**Translated Queries:**")
-                st.write(translated_queries.split("\n")[-1])  # Print only the updated question part
-
-        # Indexing
-        if st.button("Apply Indexing"):
-            st.write(f"**Applied Indexing Method:** {indexing_method}")
-            if pdf_file is not None:
-                # Extract and search PDF content
-                pdf_text_data = extract_text_from_pdf(pdf_file)
-                search_results = search_pdf_content(pdf_text_data, prompt)
-
-                if search_results:
-                    st.write("**Relevant Content from PDF:**")
-                    for result in search_results:
-                        st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
-
-                    # Split text into chunks
-                    chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
-                    st.write("**Chunks Obtained from PDF:**")
-                    for i, chunk in enumerate(chunks):
-                        st.write(f"**Chunk {i + 1}:** {chunk}")
-
-                    # Print summary of split for Multi-Representation
-                    if indexing_method == "Multi-Representation":
-                        st.write("**Summary of Split:**")
-                        summary = query_llama_model(f"Summarize the following text:\n{chunks[0]}", max_new_tokens, temperature, top_k)
-                        st.write(summary)
-                else:
-                    st.write("**No relevant content found in the PDF.**")
-            else:
-                st.write("**No PDF uploaded.**")
-
-        # Generate Response
-        if st.button("Generate Response"):
-            if pdf_file is not None:
-                # Extract and search PDF content
-                pdf_text_data = extract_text_from_pdf(pdf_file)
-                search_results = search_pdf_content(pdf_text_data, prompt)
-
-                if search_results:
-                    st.write("**Relevant Content from PDF:**")
-                    for result in search_results:
-                        st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
-
-                    # Generate response based on PDF content
-                    pdf_context = "\n".join([result["content"] for result in search_results])
-                    response = query_llama_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
-                else:
-                    st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
-                    response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
-            else:
-                st.write("**No PDF uploaded. Generating response without PDF context.**")
-                response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
-
-            if response:
-                st.write("**Response:**", response)
-
-if __name__ == "__main__":
-    main()
+# Function to create embeddings for the PDF text
+def get_embeddings(text_data):
+    texts = [entry['content'] for entry in text_data]
+    return embedding_model.encode(texts, convert_to_tensor=False)
+
+# Function to perform KNN or cosine similarity search
+def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
+    query_embedding = embedding_model.encode([query])[0]
+    pdf_embeddings = get_embeddings(pdf_text_data)
+
+    if search_type == "knn":
+        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
+        index.add(pdf_embeddings.astype('float32'))
+        distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
+        return [pdf_text_data[i] for i in indices[0]]
+
+    elif search_type == "cosine":
+        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
+        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
+        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
+        top_indices = np.argsort(similarities)[-k:][::-1]
+        return [pdf_text_data[i] for i in top_indices]
+
+# Streamlit UI
+st.title("PDF Search with LLM and Semantic Search")
+
+pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
+search_query = st.text_input("Enter your search query")
+
+search_method = st.radio("Select Search Method", ("knn", "cosine"))
+k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)
+
+if pdf_file and search_query:
+    pdf_text_data = extract_text_from_pdf(pdf_file)
+    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
+
+    st.write("### Search Results")
+    for res in results:
+        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")
 