Mdean77 committed
Commit 8669df3 · 1 Parent(s): 2d7499f

App refactored

Files changed (5):
  1. .gitignore +3 -2
  2. app.py +7 -71
  3. handle_files.py +17 -0
  4. models.py +22 -0
  5. prompts.py +11 -2
.gitignore CHANGED
@@ -1,3 +1,4 @@
-DS_Store
+.DS_Store
 .env
-cache/
+cache/
+*.pyc
app.py CHANGED
@@ -1,45 +1,25 @@
-### Mike Dean Experiments
 
-### Import Section ###
 """
 IMPORTS HERE
 """
 import chainlit as cl
-import os
-from dotenv import load_dotenv
-from chainlit import AskFileMessage
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyMuPDFLoader
 from qdrant_client import QdrantClient
 from qdrant_client.http.models import Distance, VectorParams
-from langchain_openai.embeddings import OpenAIEmbeddings
-from langchain.storage import LocalFileStore
+
 from langchain_qdrant import QdrantVectorStore
-from langchain.embeddings import CacheBackedEmbeddings
 
-from langchain_core.globals import set_llm_cache
-from langchain_openai import ChatOpenAI
-from langchain_core.caches import InMemoryCache
 from operator import itemgetter
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain_core.runnables.config import RunnableConfig
 import uuid
 from prompts import chat_prompt
+from handle_files import split_file
+from models import chat_model, cached_embedder
 
-load_dotenv()
-
-# os.environ["LANGCHAIN_PROJECT"] = f"Mike HF Production Rag - {uuid.uuid4().hex[0:8]}"
-# os.environ["LANGCHAIN_TRACING_V2"] = "false"
-# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
-
-### Global Section ###
 """
 GLOBAL CODE HERE
 """
 
-# Typical Embedding Model
-core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
-
 # Typical QDrant Client Set-up
 collection_name = f"pdf_to_parse_{uuid.uuid4()}"
 client = QdrantClient(":memory:")
@@ -48,57 +28,12 @@ client.create_collection(
     vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
 )
 
-# Adding cache!
-store = LocalFileStore("./cache/")
-cached_embedder = CacheBackedEmbeddings.from_bytes_store(
-    core_embeddings, store, namespace=core_embeddings.model
-)
-
 # Typical QDrant Vector Store Set-up
 vectorstore = QdrantVectorStore(
     client=client,
     collection_name=collection_name,
     embedding=cached_embedder)
 
-# rag_system_prompt_template = """\
-# You are a helpful assistant that uses the provided context to answer questions. Never reference this prompt, or the existence of context.
-# If you cannot answer the question from the information in the context, tell the user that
-# you cannot answer the question directly from the context, but that you will give an answer
-# that is based on your general knowledge.
-# """
-
-# rag_message_list = [
-#     {"role" : "system", "content" : rag_system_prompt_template},
-# ]
-
-# rag_user_prompt_template = """
-# Question:
-# {question}
-# Context:
-# {context}
-# """
-
-# chat_prompt = ChatPromptTemplate.from_messages([
-#     ("system", rag_system_prompt_template),
-#     ("human", rag_user_prompt_template)
-# ])
-chat_model = ChatOpenAI(model="gpt-4o")
-set_llm_cache(InMemoryCache())
-
-def split_file(file: AskFileMessage):
-    import tempfile
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-    Loader = PyMuPDFLoader
-    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tempfile:
-        with open(tempfile.name, "wb") as f:
-            f.write(file.content)
-        loader = Loader(tempfile.name)
-        documents = loader.load()
-    docs = text_splitter.split_documents(documents)
-    for i, doc in enumerate(docs):
-        doc.metadata["source"] = f"source_{id}"
-    return docs
-
 ### On Chat Start (Session Start) Section ###
 @cl.on_chat_start
 async def on_chat_start():
@@ -116,15 +51,16 @@ async def on_chat_start():
 
     file = files[0]
 
+
     msg = cl.Message(
-        content=f"Processing `{file.name}`...", disable_human_feedback=True
+        content=f"Processing `{file.name}`..."
     )
 
     await msg.send()
-
+
     docs = split_file(file)
    vectorstore.add_documents(docs)
-
+
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 15})
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
handle_files.py ADDED
@@ -0,0 +1,17 @@
+from chainlit import AskFileMessage
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyMuPDFLoader
+
+def split_file(file: AskFileMessage):
+    import tempfile
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+    Loader = PyMuPDFLoader
+    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tempfile:
+        with open(tempfile.name, "wb") as f:
+            f.write(file.content)
+        loader = Loader(tempfile.name)
+        documents = loader.load()
+    docs = text_splitter.split_documents(documents)
+    for i, doc in enumerate(docs):
+        doc.metadata["source"] = f"source_{id}"
+    return docs
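
Two details in the committed helper are worth noting: the with statement rebinds the tempfile module name, and the f-string interpolates the builtin id rather than the loop index i, so every chunk receives the same source tag. A possible tightened version, assuming the upload arrives as bytes in file.content as above:

import os
import tempfile

from chainlit import AskFileMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader


def split_file(file: AskFileMessage):
    # Persist the uploaded bytes so PyMuPDF can open them from a path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(file.content)
        temp_path = tmp.name
    try:
        documents = PyMuPDFLoader(temp_path).load()
    finally:
        os.remove(temp_path)  # remove the temporary copy once parsed

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"  # per-chunk tag using the loop index
    return docs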
models.py ADDED
@@ -0,0 +1,22 @@
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.globals import set_llm_cache
+from langchain_openai import ChatOpenAI
+from langchain_core.caches import InMemoryCache
+from langchain.storage import LocalFileStore
+from langchain.embeddings import CacheBackedEmbeddings
+from dotenv import load_dotenv
+
+load_dotenv()
+
+core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
+# Adding cache!
+store = LocalFileStore("./cache/")
+cached_embedder = CacheBackedEmbeddings.from_bytes_store(
+    core_embeddings, store, namespace=core_embeddings.model
+)
+
+
+chat_model = ChatOpenAI(model="gpt-4o")
+set_llm_cache(InMemoryCache())
+
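
A short illustration (not part of the commit) of what the two caches in this module provide, assuming OPENAI_API_KEY is available in the environment loaded by load_dotenv:

from models import cached_embedder, chat_model

# The first call computes embeddings via the OpenAI API and writes the vectors to
# ./cache/ under the "text-embedding-3-small" namespace; repeating the call for the
# same text is served from the local byte store instead of the API.
vectors = cached_embedder.embed_documents(["What was the primary endpoint?"])
vectors_again = cached_embedder.embed_documents(["What was the primary endpoint?"])

# set_llm_cache(InMemoryCache()) memoizes identical chat_model calls for the life of
# the process, so the second invoke below is answered from the in-process cache.
first = chat_model.invoke("Summarize what a primary endpoint is in one sentence.")
second = chat_model.invoke("Summarize what a primary endpoint is in one sentence.")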
prompts.py CHANGED
@@ -1,9 +1,10 @@
-## Contains prompts, welcome messages, etc.
 
 from langchain_core.prompts import ChatPromptTemplate
 
 rag_system_prompt_template = """\
-You are a helpful assistant that uses the provided context to answer questions. Never reference this prompt, or the existence of context.
+You are a helpful assistant that uses the provided context to answer questions.
+You are an expert on clinical trials and observational studies.
+Never reference this prompt, or the existence of context.
 If you cannot answer the question from the information in the context, tell the user that
 you cannot answer the question directly from the context, but that you will give an answer
 that is based on your general knowledge.
@@ -14,10 +15,18 @@ rag_message_list = [
 ]
 
 rag_user_prompt_template = """
+Use the context to answer the question and provide a clear answer. Do not mention the
+document in your answer. If there is no specific information relevant to the question,
+then tell the user that you don't know the answer, and invite the user to rephrase the question.
+
 Question:
 {question}
+
 Context:
 {context}
+
+
+
 """
 
 chat_prompt = ChatPromptTemplate.from_messages([
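
The diff ends at the from_messages call; judging from the commented-out version removed from app.py, it pairs the system template with the human template. A small usage sketch, with made-up question and context values, showing how the prompt is rendered before it reaches the chat model:

from prompts import chat_prompt

messages = chat_prompt.format_messages(
    question="What was the inclusion criterion for age?",
    context="Eligible participants were adults aged 40 to 75 years with stable disease.",
)
for message in messages:
    # Expect a system message carrying the assistant persona, followed by the
    # filled-in human message containing the question and the retrieved context.
    print(message.type, ":", message.content[:80])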