# --- Hugging Face Spaces scrape residue (kept as comments so the file parses) ---
# Spaces: Sleeping
# File size: 3,676 Bytes
# 7b666bb c0a164f d2c0564 5e06280 a0f23a4 15f5963 a0f23a4 7b666bb d2c0564 a0f23a4 d2c0564 a1fd273 5e06280 a1fd273 a0f23a4 a1fd273 5e06280 a0f23a4
import streamlit as st
import os
from langdetect import detect
from PyPDF2 import PdfReader
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Load the API key from Streamlit secrets
API_KEY = st.secrets["Key2"]
# Hosted text-generation endpoint for HuggingFaceH4/zephyr-7b-alpha.
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"
# Load the embedding model for semantic search
# (all-MiniLM-L6-v2 is a small sentence-embedding model; downloaded on first run).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to query the LLM via Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Query the Zephyr model via the Hugging Face Inference API.

    Args:
        prompt: Text prompt sent to the model.
        max_new_tokens: Cap on the number of tokens generated.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.

    Returns:
        The generated text, or None if the request failed (the error is
        surfaced in the Streamlit UI rather than raised).
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    # Timeout keeps the Streamlit worker from hanging on a stalled request.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    if response.status_code == 200:
        result = response.json()
        # The text-generation endpoint returns a list of dicts:
        # [{"generated_text": ...}]. The original indexed the JSON directly
        # with ["generated_text"], which raises TypeError on a list.
        if isinstance(result, list):
            return result[0].get("generated_text")
        return result.get("generated_text")
    st.error(f"Error querying the API: {response.status_code}, {response.text}")
    return None
# Function to detect language
def detect_language(text):
    """Return the detected ISO language code of *text*, or "en" on failure."""
    try:
        code = detect(text)
    except Exception:
        code = "en"  # langdetect raises on empty/undecidable input; fall back to English
    return code
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF with page and line positions.

    Args:
        pdf_file: A file-like object accepted by PyPDF2's PdfReader.

    Returns:
        A list of dicts, one per non-empty extracted line:
        {"page": 1-based page number, "line": 1-based line number,
         "content": the line's text}. Pages with no extractable text
        are skipped.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        # Extract once per page — the original called extract_text() twice,
        # doubling the (expensive) page-parsing work.
        page_text = page.extract_text()
        if page_text:
            for line_num, line in enumerate(page_text.split('\n')):
                text_data.append({
                    "page": page_num + 1,
                    "line": line_num + 1,
                    "content": line,
                })
    return text_data
# Function to create embeddings for the PDF text
def get_embeddings(text_data):
    """Embed the "content" field of each entry in *text_data*.

    Returns the encoder's output as plain arrays (convert_to_tensor=False),
    one vector per entry, in input order.
    """
    contents = [row["content"] for row in text_data]
    return embedding_model.encode(contents, convert_to_tensor=False)
# Function to perform KNN or cosine similarity search
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    """Find the k lines of the PDF most relevant to *query*.

    Args:
        pdf_text_data: Output of extract_text_from_pdf (dicts with "content").
        query: Free-text search query.
        search_type: "knn" (faiss L2 search) or "cosine" (normalized dot product).
        k: Number of results to return.

    Returns:
        Up to k entries from pdf_text_data, most relevant first. Returns []
        for an unrecognized search_type (the original fell through and
        returned None, which crashed the iterating caller).
    """
    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = get_embeddings(pdf_text_data)
    if search_type == "knn":
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings.astype('float32'))
        distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        # faiss pads results with -1 when k exceeds the corpus size; the
        # original indexed pdf_text_data[-1] in that case, silently
        # returning the wrong (last) line. Filter out the padding.
        return [pdf_text_data[i] for i in indices[0] if i >= 0]
    elif search_type == "cosine":
        # Normalize both sides so the dot product equals cosine similarity.
        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [pdf_text_data[i] for i in top_indices]
    return []
# Streamlit UI
st.title("PDF Search with LLM and Semantic Search")
# Inputs: the PDF to search, the free-text query, the retrieval method,
# and how many results to show.
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)
# Runs on every widget change; the PDF is re-parsed and embeddings are
# recomputed each time (no st.cache_* here), which is slow for large PDFs.
if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
    st.write("### Search Results")
    for res in results:
        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")
|