# Streamlit RAG demo app (deployed as a Hugging Face Space).
# Stdlib imports first, then third-party, per convention.
import os

import requests
import streamlit as st
from langdetect import detect
from PyPDF2 import PdfReader

# Hugging Face API token, read from the environment variable / secret
# named "Key2" (the secret name must match this string exactly).
# May be None if the secret is not configured; API calls will then fail
# with an auth error surfaced in the UI.
token = os.environ.get("Key2")
# Function to query the Hugging Face API
def query_huggingface_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Send `prompt` to the Hugging Face Inference API and return generated text.

    Args:
        prompt: The text prompt to send to the model.
        max_new_tokens: Maximum number of tokens the model may generate.
        temperature: Sampling temperature passed to the model.
        top_k: Top-k sampling parameter passed to the model.

    Returns:
        The generated text string on success, or None on any failure
        (the error is shown in the Streamlit UI via st.error).
    """
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    try:
        # A timeout keeps the Streamlit app from hanging forever if the
        # inference endpoint is slow or unreachable (the original had none).
        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
    except requests.RequestException as exc:
        st.error(f"Error: request failed - {exc}")
        return None
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    st.error(f"Error: {response.status_code} - {response.text}")
    return None
# Function to detect language
def detect_language(text):
    """Return the ISO language code detected for `text`.

    Falls back to "en" when detection fails (e.g. text too short or
    ambiguous for langdetect).
    """
    try:
        return detect(text)
    # Narrowed from a bare `except:`, which also swallowed SystemExit
    # and KeyboardInterrupt.
    except Exception:
        return "en"  # Default to English if detection fails
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, tagging each line with its page/line number.

    Args:
        pdf_file: A file-like object (e.g. Streamlit upload) readable by PdfReader.

    Returns:
        A list of dicts: {"page": int, "line": int, "content": str},
        with 1-based page and line numbers.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # extract_text() can return None for pages with no extractable text
        # (scanned images etc.); treat those as empty instead of crashing
        # on .split().
        page_text = page.extract_text() or ""
        for line_num, line in enumerate(page_text.split('\n'), start=1):
            text_data.append({
                "page": page_num,
                "line": line_num,
                "content": line
            })
    return text_data
# Default system prompts for each query translation method.
# Keys match the options offered in the "Query Translation" selectbox;
# every template contains a `{question}` placeholder that is filled with
# the user's prompt via str.format(question=...).
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
# Streamlit App
def main():
    """Render the Streamlit UI: sidebar controls, prompt input, and actions.

    Wires together query translation, indexing selection, LLM generation,
    and PDF keyword search. All heavy lifting is delegated to the module
    helpers; this function only handles UI flow.
    """
    st.title("RAG Model with Advanced Query Translation and Indexing")
    st.write("Enter a prompt and get a response from the model.")

    # --- Sidebar controls ---
    st.sidebar.title("Options")

    st.sidebar.header("Upload PDF")
    uploaded_pdf = st.sidebar.file_uploader("Upload a PDF file", type="pdf")

    st.sidebar.header("Query Translation")
    translation_method = st.sidebar.selectbox(
        "Select Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"],
    )

    st.sidebar.header("Indexing")
    index_choice = st.sidebar.selectbox(
        "Select Indexing Method",
        ["Multi-Representation", "Raptors", "ColBERT"],
    )

    st.sidebar.header("LLM Parameters")
    tokens_limit = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
    temp_setting = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
    top_k_setting = st.sidebar.slider("Top K", 1, 100, 50)

    # System prompt defaults to the template for the chosen translation method.
    st.sidebar.header("System Prompt")
    sys_prompt = st.sidebar.text_area(
        "System Prompt", DEFAULT_SYSTEM_PROMPTS[translation_method]
    )

    # --- Main panel ---
    st.header("Input Prompt")
    user_prompt = st.text_input("Enter your prompt:")

    if user_prompt:
        st.write("**Prompt:**", user_prompt)
        detected = detect_language(user_prompt)
        st.write(f"**Detected Language:** {detected}")

        if st.button("Apply Query Translation"):
            st.write(f"**Applied Query Translation Method:** {translation_method}")
            # Substitute the user's question into the system prompt template.
            filled_prompt = sys_prompt.format(question=user_prompt)
            st.write("**Formatted System Prompt:**", filled_prompt)
            rewritten = query_huggingface_api(
                filled_prompt, tokens_limit, temp_setting, top_k_setting
            )
            if rewritten:
                st.write("**Translated Queries:**", rewritten)

        if st.button("Apply Indexing"):
            st.write(f"**Applied Indexing Method:** {index_choice}")
            # Indexing logic is a placeholder; only ColBERT prints a message.
            if index_choice == "ColBERT":
                st.write("Indexing with ColBERT...")

        if st.button("Generate Response"):
            answer = query_huggingface_api(
                user_prompt, tokens_limit, temp_setting, top_k_setting
            )
            if answer:
                st.write("**Response:**", answer)

    # --- PDF keyword search (substring match, case-insensitive) ---
    if uploaded_pdf is not None:
        pdf_lines = extract_text_from_pdf(uploaded_pdf)
        if user_prompt:
            needle = user_prompt.lower()
            for row in pdf_lines:
                if needle in row["content"].lower():
                    st.write(
                        f"**Page {row['page']}, Line {row['line']}:** {row['content']}"
                    )
# Run the app only when executed as a script (not on import).
if __name__ == "__main__":
    main()