Spaces:
Sleeping
Sleeping
File size: 3,637 Bytes
6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 6d0a4ef de6b3b3 a7cdcc5 c82c449 de6b3b3 6d0a4ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import streamlit as st
import PyPDF2
import pptx
import os
from langchain.llms import HuggingFaceHub
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import OpenAIEmbeddings
import cassio
from langchain.text_splitter import CharacterTextSplitter
from huggingface_hub import login
# Credentials are read from the environment so that no secret lives in the source.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Halt the Streamlit script at the first missing credential; each check shows
# its own message in the UI before stopping.
if not (ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_ID):
    st.error("Astra DB credentials are missing. Set the environment variables.")
    st.stop()

if not HUGGINGFACE_API_KEY:
    st.error("Hugging Face API key is missing. Set the HUGGINGFACE_API_KEY environment variable.")
    st.stop()

if not OPENAI_API_KEY:
    st.error("OpenAI API key is missing. Set the OPENAI_API_KEY environment variable.")
    st.stop()

# Initialize the Astra DB connection through cassio.
cassio.init(database_id=ASTRA_DB_ID, token=ASTRA_DB_APPLICATION_TOKEN)

# Log in to the Hugging Face Hub, then build the LLM used to answer queries
# and the embedding model used by the vector store.
login(token=HUGGINGFACE_API_KEY)
hf_llm = HuggingFaceHub(
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
    repo_id="google/flan-t5-large",
    model_kwargs={"temperature": 0, "max_length": 64},
)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Vector store that keeps the embedded chunks in the "qa_mini_demo" table.
astra_vector_store = Cassandra(table_name="qa_mini_demo", embedding=embedding)
def extract_text_from_pdf(uploaded_file):
    """Return the text of a PDF, with a newline appended after each page.

    Pages for which no text can be extracted are skipped. If reading the
    file raises, the error is reported through ``st.error`` and whatever
    text was gathered before the failure is returned (possibly ``""``).
    """
    page_chunks = []
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        for pdf_page in reader.pages:
            content = pdf_page.extract_text()
            if not content:
                # extract_text() may yield None/"" for a page; skip those.
                continue
            page_chunks.append(content + "\n")
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
    return "".join(page_chunks)
def extract_text_from_ppt(uploaded_file):
    """Return the text of a PowerPoint file, one shape's text per line.

    Every slide is visited in order and each shape exposing a ``text``
    attribute contributes its text followed by a newline. If reading the
    file raises, the error is reported through ``st.error`` and whatever
    text was gathered before the failure is returned (possibly ``""``).
    """
    fragments = []
    try:
        deck = pptx.Presentation(uploaded_file)
        all_shapes = (shape for slide in deck.slides for shape in slide.shapes)
        for shape in all_shapes:
            if not hasattr(shape, "text"):
                # Shapes without a text attribute carry nothing to extract.
                continue
            fragments.append(shape.text + "\n")
    except Exception as e:
        st.error(f"Error reading PPT: {e}")
    return "".join(fragments)
def main():
    """Render the "Chat with Documents" page.

    The page has two independent actions:

    * "Extract Text" reads the uploaded PDF/PPTX, splits the text into
      overlapping chunks and adds them to ``astra_vector_store``.
    * "Submit Query" runs the entered query against the vector store
      using ``hf_llm`` and writes the response to the page.

    Returns:
        None. All output goes to the Streamlit UI.
    """
    st.title("Chat with Documents")
    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")

    if extract_button:
        if uploaded_file is None:
            # Tell the user why nothing happened instead of ignoring the click.
            st.warning("Please upload a PDF or PPTX file before extracting text.")
        else:
            # Match the extension case-insensitively so a name such as
            # "REPORT.PDF" is routed to an extractor instead of being
            # silently ignored.
            file_name = uploaded_file.name.lower()
            extracted_text = ""
            if file_name.endswith(".pdf"):
                extracted_text = extract_text_from_pdf(uploaded_file)
            elif file_name.endswith(".pptx"):
                extracted_text = extract_text_from_ppt(uploaded_file)

            texts = []
            # Whitespace-only text has nothing worth embedding.
            if extracted_text.strip():
                text_splitter = CharacterTextSplitter(
                    separator="\n",
                    chunk_size=800,
                    chunk_overlap=200,
                    length_function=len,
                )
                texts = text_splitter.split_text(extracted_text)

            if texts:
                astra_vector_store.add_texts(texts)
                st.success("Text extracted and stored successfully!")
            else:
                st.warning("No text could be extracted from the uploaded file.")

    # Wrap the vector store so it can be queried with an LLM.
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")
    if submit_query and query:
        response = astra_vector_index.query(query, llm=hf_llm)
        st.write(f"Response: {response}")


if __name__ == "__main__":
    main()
|