# File size: 5,531 Bytes
# 7ba5af2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# (removed: bare line-number residue left over from file extraction)
import os
import re
import zipfile

import faiss
import gradio as gr
import numpy as np
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from PyPDF2 import PdfReader

# SECURITY: an Anthropic API key was previously hard-coded here and committed
# to source control.  Read it from the environment instead; the leaked key
# must be revoked and rotated.
API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.5, max_tokens=8192, anthropic_api_key=API_KEY)

# Local sentence-transformer model used to embed document chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Conversation memory shared by every retrieval chain built in chat();
# cleared by reset_chat().
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Populated by process_files() or load_existing_index(); None until then.
vector_store = None


def process_file(file_path):
    """Read one .txt, .docx, or .pdf file and wrap its text in a Document.

    Returns a single-element list of Document (so callers can ``extend()``
    directly), or None for unsupported extensions or read failures.
    """
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    try:
        if ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        elif ext == '.docx':
            # BUG FIX: a .docx file is a ZIP archive, not UTF-8 text; decoding
            # the raw bytes (the old approach) produced binary garbage.  The
            # document body lives in word/document.xml inside the archive, so
            # extract that and strip the XML markup instead.
            with zipfile.ZipFile(file_path) as archive:
                xml_content = archive.read('word/document.xml').decode('utf-8', errors='ignore')
            # Paragraph closings become newlines; every other tag is dropped.
            xml_content = re.sub(r'</w:p>', '\n', xml_content)
            text = re.sub(r'<[^>]+>', '', xml_content)
        elif ext == '.pdf':
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                # Pages with no extractable text (e.g. scanned images) are skipped.
                text = '\n'.join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])
        else:
            print(f"Unsupported file type: {ext}")
            return None

        return [Document(page_content=text, metadata={"source": file_path})]
    except Exception as e:
        # Best-effort: report and skip the bad file rather than abort the batch.
        print(f"Error processing file {file_path}: {str(e)}")
        return None


def process_files(file_list, progress=gr.Progress()):
    """Embed the selected files into a FAISS index saved under ``faiss_index``.

    Sets the module-level ``vector_store`` and returns a status string for
    the UI.  ``progress=gr.Progress()`` is Gradio's documented injection
    pattern, not an accidental mutable default.
    """
    global vector_store
    documents = []
    total_files = len(file_list)

    for i, file in enumerate(file_list):
        # BUG FIX: the loop used to drive the bar all the way to 1.0 and then
        # the later stages reported 0.5/0.7/0.9, making progress jump
        # backwards.  File reading now occupies only the first half of the bar.
        progress(0.5 * (i + 1) / total_files, f"Processing file {i + 1} of {total_files}")
        if file.name.lower().endswith(('.txt', '.docx', '.pdf')):
            docs = process_file(file.name)
            if docs:
                documents.extend(docs)

    if not documents:
        return "No documents were successfully processed. Please check your files and try again."

    progress(0.5, "Splitting text")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    progress(0.7, "Creating embeddings")
    vector_store = FAISS.from_documents(texts, embeddings)

    progress(0.9, "Saving vector store")
    vector_store.save_local("faiss_index")

    progress(1.0, "Completed")
    return f"Embedding process completed and database created. Processed {len(documents)} files. You can now start chatting!"


def load_existing_index(folder_path):
    """Load a previously saved FAISS index into the module-level store.

    Returns a human-readable status string for the UI output textbox.
    """
    global vector_store
    try:
        required = [os.path.join(folder_path, name) for name in ("index.faiss", "index.pkl")]
        if not all(os.path.exists(path) for path in required):
            return (f"Error: FAISS index files not found in {folder_path}."
                    " Please ensure both 'index.faiss' and 'index.pkl' are present.")

        # allow_dangerous_deserialization: the pickle sidecar is trusted here
        # because it was produced locally by process_files().
        vector_store = FAISS.load_local(folder_path, embeddings, allow_dangerous_deserialization=True)
        return f"Successfully loaded existing index from {folder_path}."
    except Exception as e:
        return f"Error loading index: {str(e)}"


def chat(message, history):
    """Answer one user message via retrieval-augmented generation.

    Requires a vector store to have been built or loaded first; otherwise
    returns an instructional message for the UI.
    """
    global vector_store
    if vector_store is None:
        return "Please load documents or an existing index first."

    retriever = vector_store.as_retriever()
    chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory)

    response = chain.invoke({"question": message, "chat_history": history})
    return response['answer']


def reset_chat():
    """Wipe the shared conversation memory and return an empty chat history.

    The empty list clears the Gradio Chatbot component it is wired to.
    (No ``global`` statement needed: ``memory`` is only mutated in place.)
    """
    memory.clear()
    return []


# Gradio UI: file ingestion / index loading on top, chat interface below.
with gr.Blocks() as demo:
    gr.Markdown("# Document-based Chatbot")

    with gr.Row():
        # Left column: upload fresh documents and build a new index.
        with gr.Column():
            file_input = gr.File(label="Select Files", file_count="multiple", file_types=[".pdf", ".docx", ".txt"])
            process_button = gr.Button("Process Files")
        # Right column: point at an already-saved FAISS index on disk.
        with gr.Column():
            index_folder = gr.Textbox(label="Existing Index Folder Path",
                                      value="C:\\Works\\Data\\projects\\Python\\QA_Chatbot\\faiss_index")
            load_index_button = gr.Button("Load Existing Index")

    # Status line shared by both the "process" and "load index" actions.
    output = gr.Textbox(label="Processing Output")

    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    send = gr.Button("Send")
    clear = gr.Button("Clear")


    def process_selected_files(files):
        # Guard against clicking "Process Files" with nothing selected.
        if files:
            return process_files(files)
        else:
            return "No files selected. Please select files and try again."


    def load_selected_index(folder_path):
        # Thin wrapper so the button handler has a UI-local name.
        return load_existing_index(folder_path)


    process_button.click(process_selected_files, file_input, output)
    load_index_button.click(load_selected_index, index_folder, output)


    def respond(message, chat_history):
        # Run the QA chain, append the (user, bot) pair, and clear the textbox
        # (the empty string returned first is wired back into `msg`).
        bot_message = chat(message, chat_history)
        chat_history.append((message, bot_message))
        return "", chat_history


    # Enter key and Send button share the same handler.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    send.click(respond, [msg, chatbot], [msg, chatbot])
    clear.click(reset_chat, None, chatbot)

if __name__ == "__main__":
    demo.launch()