File size: 3,676 Bytes
7b666bb
c0a164f
d2c0564
5e06280
a0f23a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15f5963
a0f23a4
 
 
 
 
 
 
 
 
 
 
 
 
 
7b666bb
d2c0564
 
 
 
a0f23a4
d2c0564
 
a1fd273
5e06280
 
a1fd273
 
a0f23a4
 
 
 
 
 
 
 
a1fd273
5e06280
a0f23a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
import os
from langdetect import detect
from PyPDF2 import PdfReader
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the API key from Streamlit secrets
# NOTE(review): requires a "Key2" entry in .streamlit/secrets.toml — the app
# fails at import time with a KeyError if it is missing.
API_KEY = st.secrets["Key2"]
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"

# Load the embedding model for semantic search
# Downloaded/cached by sentence-transformers on first run; loaded once at
# module import so every rerun of the Streamlit script reuses it.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to query the LLM via Hugging Face Inference API
# Function to query the LLM via Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Send *prompt* to the hosted zephyr-7b-alpha model and return its text.

    Args:
        prompt: The input text for the model.
        max_new_tokens: Generation length cap forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_k: Top-k sampling parameter forwarded to the API.

    Returns:
        The generated text as a string, or None on any API/network error
        (an error message is surfaced in the Streamlit UI instead).
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    try:
        # Timeout prevents the Streamlit script from hanging forever if the
        # inference endpoint is unresponsive.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    except requests.RequestException as exc:
        st.error(f"Error querying the API: {exc}")
        return None

    if response.status_code == 200:
        data = response.json()
        # The HF Inference API returns a LIST of candidates for
        # text-generation models: [{"generated_text": "..."}]. The original
        # code indexed the list with a string key and raised TypeError.
        if isinstance(data, list) and data:
            return data[0].get("generated_text")
        if isinstance(data, dict):
            return data.get("generated_text")
        st.error(f"Unexpected API response format: {data!r}")
        return None
    else:
        st.error(f"Error querying the API: {response.status_code}, {response.text}")
        return None

# Function to detect language
# Function to detect language
def detect_language(text):
    """Best-effort language detection via langdetect.

    Falls back to English ("en") whenever detection fails — e.g. for empty
    or very short input where langdetect raises.
    """
    try:
        code = detect(text)
    except Exception:
        code = "en"  # Default to English if detection fails
    return code

# Function to extract text from PDF with line and page numbers
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Extract every text line of a PDF, tagged with its page and line number.

    Args:
        pdf_file: A file path or binary file-like object accepted by PdfReader
            (Streamlit's UploadedFile works here).

    Returns:
        A list of dicts: {"page": int, "line": int, "content": str},
        with 1-based page and line numbers. Pages that yield no text
        (e.g. scanned images) are skipped.
    """
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # extract_text() re-parses the page content stream each call;
        # the original invoked it twice per page — call it once and reuse.
        page_text = page.extract_text()
        if not page_text:
            continue
        for line_num, line in enumerate(page_text.split('\n'), start=1):
            text_data.append({
                "page": page_num,
                "line": line_num,
                "content": line
            })
    return text_data

# Function to create embeddings for the PDF text
def get_embeddings(text_data):
    texts = [entry['content'] for entry in text_data]
    return embedding_model.encode(texts, convert_to_tensor=False)

# Function to perform KNN or cosine similarity search
# Function to perform KNN or cosine similarity search
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    """Return the k entries of *pdf_text_data* most similar to *query*.

    Args:
        pdf_text_data: List of {"page", "line", "content"} dicts from
            extract_text_from_pdf.
        query: Free-text search string.
        search_type: "knn" (faiss L2 index) or "cosine" (normalized dot product).
        k: Number of results; clamped to the corpus size.

    Returns:
        A list of the matching entry dicts, best match first.

    Raises:
        ValueError: If search_type is not "knn" or "cosine". (The original
            silently returned None, which crashed the caller's result loop.)
    """
    if not pdf_text_data:
        return []

    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = np.asarray(get_embeddings(pdf_text_data), dtype='float32')

    # BUG FIX: when k exceeds the corpus size, faiss pads the result with
    # index -1, and pdf_text_data[-1] then silently reported the *last*
    # line as a bogus match. Clamp k instead.
    k = min(k, len(pdf_text_data))

    if search_type == "knn":
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings)
        _, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        # Defensive: drop any -1 sentinel indices faiss may still emit.
        return [pdf_text_data[i] for i in indices[0] if i != -1]

    elif search_type == "cosine":
        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [pdf_text_data[i] for i in top_indices]

    raise ValueError(f"Unknown search_type: {search_type!r} (expected 'knn' or 'cosine')")

# Streamlit UI
# Top-level script body: Streamlit re-runs this on every widget interaction,
# so widget declaration order below defines the page layout.
st.title("PDF Search with LLM and Semantic Search")

# Inputs: the PDF to index and the user's search string.
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")

# Search configuration: backend (faiss KNN vs. cosine similarity) and
# how many hits to display.
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

# Run the search only once both a PDF and a non-empty query are provided.
# NOTE(review): the PDF is re-parsed and re-embedded on every rerun;
# st.cache_data on the extraction/embedding steps would avoid that.
if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)

    st.write("### Search Results")
    for res in results:
        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")