AllAboutRAG / app.py
bainskarman's picture
Update app.py
a0f23a4 verified
raw
history blame
3.68 kB
import streamlit as st
import os
from langdetect import detect
from PyPDF2 import PdfReader
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Bearer token for the Hugging Face Inference API, read from the Streamlit
# secrets entry named "Key2".
API_KEY = st.secrets["Key2"]
# Hosted inference endpoint (zephyr-7b-alpha) that query_llm_api posts to.
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
# Sentence-embedding model shared by get_embeddings and search_pdf_content.
# NOTE(review): this is loaded at module level, so Streamlit presumably
# reloads it on every script rerun — consider caching; confirm.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to query the LLM via Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Send *prompt* to the hosted LLM and return the generated text.

    Args:
        prompt: Text sent as the ``inputs`` field of the request.
        max_new_tokens: Forwarded as the ``max_new_tokens`` generation parameter.
        temperature: Forwarded as the ``temperature`` generation parameter.
        top_k: Forwarded as the ``top_k`` generation parameter.

    Returns:
        The ``generated_text`` string from the response, or ``None`` when the
        request fails, the API answers with a non-200 status, or the body does
        not contain generated text. Every failure is reported via ``st.error``.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    try:
        # Without a timeout a stalled endpoint would block the app forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    except requests.RequestException as exc:
        st.error(f"Error querying the API: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Error querying the API: {response.status_code}, {response.text}")
        return None

    try:
        data = response.json()
    except ValueError:
        st.error(f"Error querying the API: response was not valid JSON: {response.text}")
        return None

    # Text-generation endpoints answer with a list of candidates
    # ([{"generated_text": "..."}]); indexing that list with a string key
    # raised TypeError. A bare dict is accepted as well.
    if isinstance(data, list):
        data = data[0] if data else None
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]

    st.error(f"Error querying the API: unexpected response format: {data!r}")
    return None
# Function to detect language
def detect_language(text):
    """Return the language code detected for *text*, or ``"en"`` on failure.

    Detection is best-effort: any exception raised by the detector is
    swallowed and English is reported instead.
    """
    try:
        language_code = detect(text)
    except Exception:
        # Fall back to English rather than propagating detector errors.
        language_code = "en"
    return language_code
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF as a flat list of line records.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; the UI passes the uploaded
            file object.

    Returns:
        A list of dicts with keys ``page`` (1-based page number), ``line``
        (1-based line number within that page) and ``content`` (the line's
        text). Pages for which no text is extracted contribute nothing.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # Extract once per page; the previous code called extract_text()
        # twice and so did the extraction work twice.
        page_text = page.extract_text()
        if not page_text:
            continue
        for line_num, line in enumerate(page_text.split('\n'), start=1):
            text_data.append({
                "page": page_num,
                "line": line_num,
                "content": line,
            })
    return text_data
# Function to create embeddings for the PDF text
def get_embeddings(text_data):
    """Encode the ``content`` field of every entry in *text_data*.

    The texts are passed to the shared embedding model in input order with
    ``convert_to_tensor=False``, and the model's result is returned as-is.
    """
    contents = [record['content'] for record in text_data]
    vectors = embedding_model.encode(contents, convert_to_tensor=False)
    return vectors
# Function to perform KNN or cosine similarity search
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    """Return the entries of *pdf_text_data* most similar to *query*.

    Args:
        pdf_text_data: List of dicts, each with a ``content`` key holding the
            text to embed.
        query: Search text; embedded with the same model as the entries.
        search_type: ``"knn"`` for an L2 nearest-neighbour search through a
            flat FAISS index, or ``"cosine"`` for cosine similarity.
        k: Maximum number of results; capped at ``len(pdf_text_data)``.

    Returns:
        Up to ``k`` entries from *pdf_text_data*, best match first. Empty when
        there are no entries or ``k`` is less than 1.

    Raises:
        ValueError: If *search_type* is neither ``"knn"`` nor ``"cosine"``.
    """
    if search_type not in ("knn", "cosine"):
        # Previously an unknown method fell through and returned None.
        raise ValueError(
            f"Unknown search_type {search_type!r}; expected 'knn' or 'cosine'"
        )

    # Never ask for more neighbours than there are entries, and bail out
    # before embedding anything when there is nothing to return.
    k = min(k, len(pdf_text_data))
    if k < 1:
        return []

    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = get_embeddings(pdf_text_data)

    if search_type == "knn":
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings.astype('float32'))
        _, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        # FAISS pads missing neighbours with -1, which would otherwise
        # silently index the last entry of the list.
        return [pdf_text_data[i] for i in indices[0] if i >= 0]

    pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
    query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
    similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
    # argsort is ascending: take the k largest and flip to best-first.
    top_indices = np.argsort(similarities)[-k:][::-1]
    return [pdf_text_data[i] for i in top_indices]
# Streamlit UI: upload a PDF, enter a query, pick a search method and K, then
# list the best-matching lines with their page and line numbers.
st.title("PDF Search with LLM and Semantic Search")
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    if not pdf_text_data:
        # Nothing was extracted (e.g. a PDF with no text layer); searching an
        # empty corpus would fail in the embedding/index step.
        st.warning("No extractable text was found in the uploaded PDF.")
    else:
        results = search_pdf_content(
            pdf_text_data,
            search_query,
            search_type=search_method,
            # Never request more results than there are extracted lines.
            k=min(k_value, len(pdf_text_data)),
        )
        st.write("### Search Results")
        for res in results:
            st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")