Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
from langdetect import detect | |
from PyPDF2 import PdfReader | |
import requests | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
# Load the API key from Streamlit secrets
API_KEY = st.secrets["Key2"]
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"


# Load the embedding model for semantic search
@st.cache_resource
def _load_embedding_model():
    """Build the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    the resource cache the model would be re-instantiated on each rerun.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Module-level handle used by the embedding/search helpers in this file.
embedding_model = _load_embedding_model()
# Function to query the LLM via Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50, timeout=60):
    """Send ``prompt`` to the hosted model at ``API_URL`` and return its text.

    Args:
        prompt: Text passed as the ``inputs`` field of the request.
        max_new_tokens: Forwarded as the ``max_new_tokens`` generation parameter.
        temperature: Forwarded as the ``temperature`` generation parameter.
        top_k: Forwarded as the ``top_k`` generation parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``generated_text`` string from the response, or ``None`` when the
        request fails or the response has an unexpected shape. Every failure
        is reported to the user through ``st.error``.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }

    # A timeout keeps a stalled connection from hanging the app indefinitely.
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as exc:
        st.error(f"Error querying the API: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Error querying the API: {response.status_code}, {response.text}")
        return None

    try:
        data = response.json()
    except ValueError:
        st.error(f"Error querying the API: response was not valid JSON: {response.text}")
        return None

    # The text-generation endpoint answers with a list of candidates
    # ([{"generated_text": ...}]); indexing that list with a string key would
    # raise TypeError, so unwrap the first candidate before reading the key.
    if isinstance(data, list) and data:
        data = data[0]
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]

    st.error(f"Error querying the API: unexpected response format: {data!r}")
    return None
# Function to detect language
def detect_language(text):
    """Return the language code reported by ``detect`` for ``text``.

    Detection is best-effort: if ``detect`` raises for any reason, the
    function falls back to ``"en"`` instead of propagating the error.
    """
    try:
        language_code = detect(text)
    except Exception:
        language_code = "en"  # Default to English if detection fails
    return language_code
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF as a flat list of per-line records.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (the UI passes the uploaded
            file object).

    Returns:
        A list of dicts with keys ``"page"`` (1-based page number),
        ``"line"`` (1-based line number within that page) and ``"content"``
        (the line's text). Pages for which no text is extracted contribute
        no records.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # Extract once per page: the text is needed both for the emptiness
        # check and for splitting, and extraction is the expensive step.
        page_text = page.extract_text()
        if not page_text:
            continue
        for line_num, line in enumerate(page_text.split('\n'), start=1):
            text_data.append({
                "page": page_num,
                "line": line_num,
                "content": line,
            })
    return text_data
# Function to create embeddings for the PDF text
def get_embeddings(text_data):
    """Encode the ``content`` field of every record in ``text_data``.

    The texts are passed to the module-level ``embedding_model`` in the same
    order as the records, with ``convert_to_tensor=False``; the encoder's
    result is returned unchanged.
    """
    contents = []
    for record in text_data:
        contents.append(record['content'])
    embeddings = embedding_model.encode(contents, convert_to_tensor=False)
    return embeddings
# Function to perform KNN or cosine similarity search
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    """Return the ``k`` records of ``pdf_text_data`` most similar to ``query``.

    Args:
        pdf_text_data: List of record dicts, each with a ``"content"`` key
            (as produced by ``extract_text_from_pdf``).
        query: Search text; embedded with the module-level ``embedding_model``.
        search_type: ``"knn"`` for an L2 nearest-neighbour search through a
            FAISS flat index, or ``"cosine"`` for cosine similarity.
        k: Maximum number of records to return. Values larger than the number
            of records are clamped; values <= 0 yield an empty list.

    Returns:
        A list of records ordered from best to worst match. Empty when
        ``pdf_text_data`` is empty or ``k`` is not positive.

    Raises:
        ValueError: If ``search_type`` is neither ``"knn"`` nor ``"cosine"``.
    """
    if search_type not in ("knn", "cosine"):
        # Previously an unknown method fell through and returned None, which
        # only surfaced later as a confusing TypeError in the caller.
        raise ValueError(
            f"Unsupported search_type: {search_type!r} (expected 'knn' or 'cosine')"
        )

    # Never ask for more neighbours than there are records: FAISS pads missing
    # results with -1, which would silently index the *last* record, and an
    # empty corpus has no embedding dimension to build an index from.
    k = min(k, len(pdf_text_data))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = get_embeddings(pdf_text_data)

    if search_type == "knn":
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings.astype('float32'))
        _, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        # Defensive filter in case the index still reports padding entries.
        return [pdf_text_data[i] for i in indices[0] if i >= 0]

    # search_type == "cosine": normalise both sides so the dot product is the
    # cosine similarity, then take the k largest scores, best first.
    pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
    query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
    similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
    top_indices = np.argsort(similarities)[-k:][::-1]
    return [pdf_text_data[i] for i in top_indices]
# Streamlit UI
st.title("PDF Search with LLM and Semantic Search")
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

# Run the search only once both a PDF and a query have been provided.
if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    if not pdf_text_data:
        # PDFs without a text layer yield no lines; searching an empty corpus
        # would fail, so tell the user instead.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Do not request more results than there are lines to search.
        results = search_pdf_content(
            pdf_text_data,
            search_query,
            search_type=search_method,
            k=min(k_value, len(pdf_text_data)),
        )
        st.write("### Search Results")
        for res in results:
            st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")