import streamlit as st
import PyPDF2
from langchain.llms import HuggingFaceHub
import pptx
import os
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import OpenAIEmbeddings
import cassio
from langchain.text_splitter import CharacterTextSplitter




# Load API keys and database credentials from environment variables
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Initialize Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Initialize LLM & Embeddings
hf_llm = HuggingFaceHub(repo_id="google/flan-t5-large", huggingfacehub_api_token=HUGGINGFACE_API_KEY,
                        model_kwargs={"temperature": 0, "max_length": 64})
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize vector store
astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo", session=None, keyspace=None)

def extract_text_from_pdf(uploaded_file):
    """Extract text from a PDF file."""
    text = ""
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:  # Avoid NoneType error
            text += page_text + "\n"
    return text

def extract_text_from_ppt(uploaded_file):
    """Extract text from a PowerPoint file."""
    text = ""
    presentation = pptx.Presentation(uploaded_file)
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

def main():
    st.title("Chat with Documents")

    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")

    extracted_text = ""
    if extract_button and uploaded_file is not None:
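        # Route to the appropriate extractor based on the uploaded file's extension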
        if uploaded_file.name.endswith(".pdf"):
            extracted_text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.name.endswith(".pptx"):
            extracted_text = extract_text_from_ppt(uploaded_file)

        if extracted_text:
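            # Split the text into overlapping chunks (800 chars, 200 overlap) before indexing in Astra DB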
            text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
            texts = text_splitter.split_text(extracted_text)
            astra_vector_store.add_texts(texts)

    # Wrap the vector store so it can be queried directly with an LLM
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")
    if submit_query and query.strip():
        answer = astra_vector_index.query(query, llm=hf_llm)
        st.write(f"Response: {answer}")

if __name__ == "__main__":
    main()