Shahabmoin committed
Commit 99bb0bf · verified · 1 Parent(s): 428ae9d

Create app.py

Files changed (1):
app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
import streamlit as st
import os
from groq import Groq
import fitz  # PyMuPDF, for PDF parsing
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer  # Hugging Face embedding model

# Initialize the Hugging Face embedding model and the Groq API client.
# The API key is read from the environment instead of being hard-coded,
# so it is never committed to the repository.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Model for generating embeddings
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)

# Extract the full text from an uploaded PDF.
# Streamlit's file_uploader returns a file-like object, not a path,
# so the bytes are handed to PyMuPDF via the `stream` argument.
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Generate an embedding for a piece of text using the SentenceTransformer model.
def generate_huggingface_embeddings(text):
    return model.encode(text)

# Retrieve the top-k most similar document chunks for a query via FAISS
# similarity search. `index` and `document_chunks` are module-level
# globals built below, after a PDF has been uploaded.
def get_relevant_chunks(query, top_k=5):
    query_embedding = generate_huggingface_embeddings(query)
    # FAISS expects a 2-D float32 array of shape (n_queries, dim)
    query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [document_chunks[i] for i in indices[0]]
    return relevant_chunks

# Answer a query: retrieve relevant context, then ask Groq's chat model.
def generate_answer(query):
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks

    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": f"Answer the question using this context.\n\n"
                       f"Context: {context}\n\nQuestion: {query}",
        }],
        model="llama3-8b-8192",  # Adjust to the desired Groq model
        stream=False,
    )
    return chat_completion.choices[0].message.content

# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")

# Upload a PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract the text content from the uploaded PDF
    document_text = extract_text_from_pdf(pdf_file)

    # Split the document into fixed-size chunks (adjust chunk size as needed)
    chunk_size = 1000  # Characters per chunk
    document_chunks = [document_text[i:i + chunk_size]
                       for i in range(0, len(document_text), chunk_size)]

    # Generate an embedding for each chunk
    embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

    # Stack the embeddings into a float32 array for FAISS
    embeddings_array = np.array(embeddings, dtype="float32")

    # Build a FAISS index over the chunk embeddings (L2 distance metric)
    index = faiss.IndexFlatL2(embeddings_array.shape[1])
    index.add(embeddings_array)

    # Query input from the user
    query = st.text_input("Ask a question about the document:")

    if query:
        # Generate and display the answer
        answer = generate_answer(query)
        st.write("Answer: ", answer)
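
A quick way to try this locally, assuming the imports above are the app's only dependencies (faiss-cpu here stands in for whichever FAISS build you prefer, and the key value is a placeholder for a real Groq key):

pip install streamlit groq pymupdf numpy faiss-cpu sentence-transformers
export GROQ_API_KEY=your_groq_key_here
streamlit run app.py

Reading the key from the environment, as the code above does, keeps the secret out of version control; the original commit hard-coded a live key, which should be revoked and rotated.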