Update app.py
app.py CHANGED
@@ -1,33 +1,45 @@
 import streamlit as st
 import os
-import requests
 from langdetect import detect
 from PyPDF2 import PdfReader
-import
-
-
-
-
-#
-
-
-
-
-
-
-
+import requests
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+# Load the API key from Streamlit secrets
+API_KEY = st.secrets["Key2"]
+API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
+
+# Load the embedding model for semantic search
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# Function to query the LLM via Hugging Face Inference API
+def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
     }
-
-
-
-
-
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_k": top_k,
+        },
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    if response.status_code == 200:
+        return response.json()["generated_text"]
+    else:
+        st.error(f"Error querying the API: {response.status_code}, {response.text}")
+        return None
 
 # Function to detect language
 def detect_language(text):
     try:
         return detect(text)
-    except:
+    except Exception:
         return "en"  # Default to English if detection fails
 
 # Function to extract text from PDF with line and page numbers
@@ -35,160 +47,52 @@ def extract_text_from_pdf(pdf_file):
     pdf_reader = PdfReader(pdf_file)
     text_data = []
     for page_num, page in enumerate(pdf_reader.pages):
-
-
-
-
-
-
-
+        if page.extract_text():
+            lines = page.extract_text().split('\n')
+            for line_num, line in enumerate(lines):
+                text_data.append({
+                    "page": page_num + 1,
+                    "line": line_num + 1,
+                    "content": line
+                })
     return text_data
 
-# Function to
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# PDF Upload
-st.sidebar.header("Upload PDF")
-pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
-
-# Query Translation Options
-st.sidebar.header("Query Translation")
-query_translation = st.sidebar.selectbox(
-    "Select Query Translation Method",
-    ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
-)
-
-# Indexing Options
-st.sidebar.header("Indexing")
-indexing_method = st.sidebar.selectbox(
-    "Select Indexing Method",
-    ["Multi-Representation", "Raptors", "ColBERT"]
-)
-
-# LLM Parameters
-st.sidebar.header("LLM Parameters")
-max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
-temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
-top_k = st.sidebar.slider("Top K", 1, 100, 50)
-
-# System Prompt
-st.sidebar.header("System Prompt")
-default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
-system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
-
-# Main Content
-st.header("Input Prompt")
-prompt = st.text_input("Enter your prompt:")
-if prompt:
-    st.write("**Prompt:**", prompt)
-
-    # Detect Language
-    language = detect_language(prompt)
-    st.write(f"**Detected Language:** {language}")
-
-    # Query Translation
-    if st.button("Apply Query Translation"):
-        st.write(f"**Applied Query Translation Method:** {query_translation}")
-        # Format the system prompt with the user's question
-        formatted_prompt = system_prompt.format(question=prompt)
-        st.write("**Formatted System Prompt:**", formatted_prompt)
-
-        # Query the Llama model for query translation
-        translated_queries = query_llama_model(formatted_prompt, max_new_tokens, temperature, top_k)
-        if translated_queries:
-            st.write("**Translated Queries:**")
-            st.write(translated_queries.split("\n")[-1])  # Print only the updated question part
-
-    # Indexing
-    if st.button("Apply Indexing"):
-        st.write(f"**Applied Indexing Method:** {indexing_method}")
-        if pdf_file is not None:
-            # Extract and search PDF content
-            pdf_text_data = extract_text_from_pdf(pdf_file)
-            search_results = search_pdf_content(pdf_text_data, prompt)
-
-            if search_results:
-                st.write("**Relevant Content from PDF:**")
-                for result in search_results:
-                    st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
-
-                # Split text into chunks
-                chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
-                st.write("**Chunks Obtained from PDF:**")
-                for i, chunk in enumerate(chunks):
-                    st.write(f"**Chunk {i + 1}:** {chunk}")
-
-                # Print summary of split for Multi-Representation
-                if indexing_method == "Multi-Representation":
-                    st.write("**Summary of Split:**")
-                    summary = query_llama_model(f"Summarize the following text:\n{chunks[0]}", max_new_tokens, temperature, top_k)
-                    st.write(summary)
-            else:
-                st.write("**No relevant content found in the PDF.**")
-        else:
-            st.write("**No PDF uploaded.**")
-
-    # Generate Response
-    if st.button("Generate Response"):
-        if pdf_file is not None:
-            # Extract and search PDF content
-            pdf_text_data = extract_text_from_pdf(pdf_file)
-            search_results = search_pdf_content(pdf_text_data, prompt)
-
-            if search_results:
-                st.write("**Relevant Content from PDF:**")
-                for result in search_results:
-                    st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
-
-                # Generate response based on PDF content
-                pdf_context = "\n".join([result["content"] for result in search_results])
-                response = query_llama_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
-            else:
-                st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
-                response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
-        else:
-            st.write("**No PDF uploaded. Generating response without PDF context.**")
-            response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
-
-        if response:
-            st.write("**Response:**", response)
-
-if __name__ == "__main__":
-    main()
+# Function to create embeddings for the PDF text
+def get_embeddings(text_data):
+    texts = [entry['content'] for entry in text_data]
+    return embedding_model.encode(texts, convert_to_tensor=False)
+
+# Function to perform KNN or cosine similarity search
+def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
+    query_embedding = embedding_model.encode([query])[0]
+    pdf_embeddings = get_embeddings(pdf_text_data)
+
+    if search_type == "knn":
+        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
+        index.add(pdf_embeddings.astype('float32'))
+        distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
+        return [pdf_text_data[i] for i in indices[0]]
+
+    elif search_type == "cosine":
+        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
+        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
+        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
+        top_indices = np.argsort(similarities)[-k:][::-1]
+        return [pdf_text_data[i] for i in top_indices]
+
+# Streamlit UI
+st.title("PDF Search with LLM and Semantic Search")
+
+pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
+search_query = st.text_input("Enter your search query")
+
+search_method = st.radio("Select Search Method", ("knn", "cosine"))
+k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)
+
+if pdf_file and search_query:
+    pdf_text_data = extract_text_from_pdf(pdf_file)
+    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
+
+    st.write("### Search Results")
+    for res in results:
+        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")