Spaces:

bainskarman
/

AllAboutRAG

Sleeping

App Files Files Community

AllAboutRAG / app.py

bainskarman

Update app.py

a0f23a4 verified 2 months ago

raw

history blame

3.68 kB

	import streamlit as st
	import os
	from langdetect import detect
	from PyPDF2 import PdfReader
	import requests
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np

	# Bearer token for the Hugging Face Inference API, read from the Streamlit
	# secret named "Key2".
	# NOTE(review): presumably fails at startup if the secret is not configured — verify.
	API_KEY = st.secrets["Key2"]
	# Endpoint that query_llm_api posts prompts to (hosted zephyr-7b-alpha model).
	API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"

	# Sentence-embedding model used for semantic search; it encodes both the PDF
	# lines and the user's query. It is constructed at module level, i.e. every
	# time this script is executed.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Function to query the LLM via Hugging Face Inference API
	def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50, timeout=60):
	    """Send *prompt* to the hosted LLM and return the generated text.

	    Args:
	        prompt: Text sent as the ``inputs`` field of the request.
	        max_new_tokens: Forwarded as the ``max_new_tokens`` generation parameter.
	        temperature: Forwarded as the ``temperature`` generation parameter.
	        top_k: Forwarded as the ``top_k`` generation parameter.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        The generated text, or ``None`` when the request fails, the API
	        answers with a non-200 status, or the response carries no
	        ``generated_text``. Every failure is reported through ``st.error``.
	    """
	    headers = {
	        "Authorization": f"Bearer {API_KEY}",
	        "Content-Type": "application/json",
	    }
	    payload = {
	        "inputs": prompt,
	        "parameters": {
	            "max_new_tokens": max_new_tokens,
	            "temperature": temperature,
	            "top_k": top_k,
	        },
	    }

	    # A timeout keeps the app from hanging forever on a stalled connection;
	    # network-level failures are reported the same way as HTTP errors.
	    try:
	        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	    except requests.RequestException as exc:
	        st.error(f"Error querying the API: {exc}")
	        return None

	    if response.status_code != 200:
	        st.error(f"Error querying the API: {response.status_code}, {response.text}")
	        return None

	    try:
	        data = response.json()
	    except ValueError:
	        st.error(f"Error querying the API: response is not valid JSON: {response.text}")
	        return None

	    # The text-generation endpoint answers with a list of result objects
	    # ([{"generated_text": ...}]); a bare object is accepted as well.
	    if isinstance(data, list):
	        data = data[0] if data else None
	    if isinstance(data, dict) and "generated_text" in data:
	        return data["generated_text"]

	    st.error(f"Error querying the API: unexpected response format: {response.text}")
	    return None

	# Function to detect language
	def detect_language(text):
	    """Return the language code detected for *text*.

	    Detection is best-effort: if the detector raises for any reason the
	    function falls back to ``"en"`` instead of propagating the error.
	    """
	    try:
	        language_code = detect(text)
	    except Exception:
	        # Default to English if detection fails
	        language_code = "en"
	    return language_code

	# Function to extract text from PDF with line and page numbers
	def extract_text_from_pdf(pdf_file):
	    """Extract the text of a PDF line by line, keeping each line's position.

	    Args:
	        pdf_file: Passed unchanged to ``PdfReader``.

	    Returns:
	        A list of dicts, one per text line, with the keys ``"page"``
	        (1-based page number), ``"line"`` (1-based line number within that
	        page) and ``"content"`` (the line's text). Pages for which no text
	        is extracted contribute no entries.
	    """
	    text_data = []
	    for page_num, page in enumerate(PdfReader(pdf_file).pages, start=1):
	        # Extract once and reuse the result instead of parsing the page twice.
	        page_text = page.extract_text()
	        if not page_text:
	            continue
	        for line_num, line in enumerate(page_text.split('\n'), start=1):
	            text_data.append({
	                "page": page_num,
	                "line": line_num,
	                "content": line,
	            })
	    return text_data

	# Function to create embeddings for the PDF text
	def get_embeddings(text_data):
	    """Encode the ``"content"`` field of every entry with the embedding model.

	    Args:
	        text_data: Iterable of dicts that each carry a ``"content"`` key.

	    Returns:
	        Whatever ``embedding_model.encode`` returns for the collected
	        contents when called with ``convert_to_tensor=False``.
	    """
	    contents = [record['content'] for record in text_data]
	    vectors = embedding_model.encode(contents, convert_to_tensor=False)
	    return vectors

	# Function to perform KNN or cosine similarity search
	def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
	    """Return the PDF lines most similar to *query*.

	    Args:
	        pdf_text_data: List of dicts with a ``"content"`` key, as produced by
	            ``extract_text_from_pdf``.
	        query: Search text; embedded with the same model as the PDF lines.
	        search_type: ``"knn"`` for an L2 nearest-neighbour search through a
	            FAISS flat index, or ``"cosine"`` for cosine-similarity ranking.
	        k: Maximum number of entries to return.

	    Returns:
	        Up to ``k`` entries of ``pdf_text_data``, best match first. The list
	        is empty when there is nothing to search or ``k`` is not positive.

	    Raises:
	        ValueError: If ``search_type`` is neither ``"knn"`` nor ``"cosine"``.
	    """
	    if search_type not in ("knn", "cosine"):
	        raise ValueError(
	            f"Unknown search_type: {search_type!r} (expected 'knn' or 'cosine')"
	        )
	    if not pdf_text_data or k <= 0:
	        return []

	    # Asking for more neighbours than there are lines makes FAISS pad the
	    # result with -1, which would silently index the last entry; and a
	    # slice of [-0:] would return every row in the cosine branch.
	    k = min(k, len(pdf_text_data))

	    query_embedding = np.asarray(embedding_model.encode([query])[0], dtype='float32')
	    pdf_embeddings = np.asarray(get_embeddings(pdf_text_data), dtype='float32')

	    if search_type == "knn":
	        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
	        index.add(pdf_embeddings)
	        _, indices = index.search(query_embedding.reshape(1, -1), k)
	        # Defensive: drop any -1 padding labels.
	        return [pdf_text_data[i] for i in indices[0] if i >= 0]

	    # Cosine similarity: normalise both sides, guarding against zero-length
	    # vectors so they score 0 instead of producing NaN.
	    row_norms = np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
	    row_norms[row_norms == 0] = 1.0
	    query_norm = np.linalg.norm(query_embedding) or 1.0
	    similarities = np.dot(pdf_embeddings / row_norms, query_embedding / query_norm)
	    top_indices = np.argsort(similarities)[-k:][::-1]
	    return [pdf_text_data[i] for i in top_indices]

	# Streamlit UI: upload a PDF, enter a query, pick a search method and K,
	# then list the best-matching lines with their page and line numbers.
	st.title("PDF Search with LLM and Semantic Search")

	pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
	search_query = st.text_input("Enter your search query")

	search_method = st.radio("Select Search Method", ("knn", "cosine"))
	k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

	# Search only once both a file and a query have been provided.
	if pdf_file and search_query:
	    pdf_text_data = extract_text_from_pdf(pdf_file)

	    if not pdf_text_data:
	        # E.g. image-only PDFs: there is nothing to embed, and searching an
	        # empty corpus would fail inside the embedding/index code.
	        st.warning("No extractable text was found in this PDF.")
	    else:
	        # Never request more results than there are lines to search.
	        results = search_pdf_content(
	            pdf_text_data,
	            search_query,
	            search_type=search_method,
	            k=min(k_value, len(pdf_text_data)),
	        )

	        st.write("### Search Results")
	        if not results:
	            st.write("No matching lines found.")
	        for res in results:
	            st.write(f"Page {res['page']}, Line {res['line']}: {res['content']}")