Spaces: Docker-python (Build error)
app.py
ADDED
@@ -0,0 +1,91 @@
import os
import io
from typing import List

import chainlit as cl
from chainlit.types import AskFileResponse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from PyPDF2 import PdfReader

# Expect the API key in the environment (e.g. a Space secret) rather than
# hard-coding it in the source.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before starting the app.")

# Set up prompts
system_template = (
    "Use the following context to answer a user's question. "
    "If you cannot find the answer in the context, say you don't know the answer."
)
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

human_template = "Context:\n{context}\n\nQuestion:\n{question}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])


class RetrievalAugmentedQAPipeline:
    """Retrieves relevant chunks from the vector store and streams an LLM answer."""

    def __init__(self, llm: ChatOpenAI, vector_db: Chroma) -> None:
        self.llm = llm
        self.vector_db = vector_db

    async def arun_pipeline(self, user_query: str):
        # Fetch the two most similar chunks and join them into one context string.
        context_docs = self.vector_db.similarity_search(user_query, k=2)
        context_prompt = "\n".join(doc.page_content for doc in context_docs)

        # Truncate the context so the prompt stays within the model's window.
        max_context_length = 12000
        if len(context_prompt) > max_context_length:
            context_prompt = context_prompt[:max_context_length]

        messages = chat_prompt.format_prompt(
            context=context_prompt, question=user_query
        ).to_messages()

        # Stream tokens back to the caller as they arrive.
        async for chunk in self.llm.astream(messages):
            yield chunk.content


def process_pdf(file: AskFileResponse) -> List[str]:
    # PdfReader expects a path or a file-like object, not raw bytes. Older Chainlit
    # versions expose the upload as `file.content` (bytes); newer ones expose
    # `file.path`, so handle both.
    source = file.path if getattr(file, "path", None) else io.BytesIO(file.content)
    pdf_reader = PdfReader(source)
    # extract_text() can return None for image-only pages, so fall back to "".
    text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
    return text_splitter.split_text(text)


@cl.on_chat_start
async def on_chat_start():
    files = await cl.AskFileMessage(
        content="Please upload a PDF file to begin!",
        accept=["application/pdf"],
        max_size_mb=20,
    ).send()

    if not files:
        await cl.Message(content="No file was uploaded. Please try again.").send()
        return

    file = files[0]
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    texts = process_pdf(file)

    # Embed the chunks and build an in-memory Chroma index for this session.
    embeddings = OpenAIEmbeddings()
    vector_db = Chroma.from_texts(texts, embeddings)

    chat_openai = ChatOpenAI()
    pipeline = RetrievalAugmentedQAPipeline(vector_db=vector_db, llm=chat_openai)

    # Store the pipeline per user session so each chat has its own index.
    cl.user_session.set("pipeline", pipeline)

    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()


@cl.on_message
async def main(message: cl.Message):
    pipeline = cl.user_session.get("pipeline")
    if not pipeline:
        await cl.Message(content="Please upload a PDF file first.").send()
        return

    # Stream the answer token by token into a single Chainlit message.
    msg = cl.Message(content="")
    async for chunk in pipeline.arun_pipeline(message.content):
        await msg.stream_token(chunk)

    await msg.send()
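
requirements.txt (not in this commit; a sketch)

The Space reports a build error, and in a Docker-python Space every package app.py imports has to be installed in the image. A minimal requirements.txt that would satisfy the imports above might look like the following; the package list is inferred from the imports, and any version pinning is left out since the repo does not specify versions:

chainlit
langchain
langchain-community
openai
chromadb
PyPDF2
tiktoken

A Docker Space would also need a Dockerfile whose entry point starts the app, for example with `chainlit run app.py --host 0.0.0.0 --port 7860` (7860 is the port Hugging Face Spaces expects; the flag names assume a recent Chainlit CLI).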