Ferhan taha committed on
Commit
a13979d
·
verified ·
1 Parent(s): fb0c6ff

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/14JJlKx1Oj4px4gdVwHn55FstUl2Dvh9z
8
+ """
9
+
10
+
11
+
12
+ #|export
13
+ import os
14
+
15
+ from langchain.document_loaders import PyPDFLoader
16
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain.vectorstores import Chroma
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from langchain.embeddings import HuggingFaceEmbeddings
20
+ from langchain.llms import HuggingFacePipeline
21
+ from langchain.chains import ConversationChain
22
+ from langchain.memory import ConversationBufferMemory
23
+ from langchain.llms import HuggingFaceHub
24
+ import pandas as pd
25
+ from pathlib import Path
26
+ import chromadb
27
+ import gradio as gr
28
+ from transformers import AutoTokenizer
29
+ import transformers
30
+ import torch
31
+ import tqdm
32
+ import accelerate
33
+
34
+ #|export
35
def _sanitize_collection_name(raw_name):
    """Return ``raw_name`` adjusted for use as a vector-store collection name.

    Spaces become hyphens, the result is capped at 50 characters, padded to at
    least 3 characters, and forced to start and end with an alphanumeric
    character.
    """
    name = raw_name.replace(" ", "-")
    # Limit length to 50 characters.
    name = name[:50]
    # Pad very short (or empty) names so the indexing below is always valid.
    # NOTE(review): the 3-character minimum follows Chroma's naming rules —
    # confirm against the installed chromadb version.
    name = name.ljust(3, "x")
    # Strings are immutable, so rebuild the name rather than assigning to an
    # index (the previous in-place assignment raised TypeError).
    if not name[0].isalnum():
        name = "A" + name[1:]
    if not name[-1].isalnum():
        name = name[:-1] + "Z"
    return name


def initialize_database(file_path):
    """Build a vector database from the PDF at ``file_path``.

    The collection name is derived from the file name without its extension
    and sanitized by ``_sanitize_collection_name``.

    Args:
        file_path: Path to the PDF file to load and index.

    Returns:
        A ``(vector_db, collection_name, status)`` tuple, where ``status`` is
        the literal string ``"Complete!"``.
    """
    collection_name = _sanitize_collection_name(Path(file_path).stem)
    print('Collection name: ', collection_name)
    # Load document and create splits.
    doc_splits = load_doc(file_path)
    # Create the vector database for those splits.
    vector_db = create_db(doc_splits, collection_name)
    return vector_db, collection_name, "Complete!"
56
+
57
+ #|export
58
def load_doc(file_path):
    """Load the PDF at ``file_path`` and split it into chunks.

    Pages are read with ``PyPDFLoader`` and split with a
    ``RecursiveCharacterTextSplitter`` configured with ``chunk_size=600`` and
    ``chunk_overlap=50``.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The split documents produced by the text splitter.
    """
    pdf_pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
    return splitter.split_documents(pdf_pages)
64
+
65
+ #|export
66
def create_db(splits, collection_name):
    """Create a Chroma vector store holding ``splits``.

    Uses default ``HuggingFaceEmbeddings`` and a new
    ``chromadb.EphemeralClient`` (no persist directory is configured).

    Args:
        splits: Document chunks to index.
        collection_name: Name of the Chroma collection to create or fill.

    Returns:
        The ``Chroma`` vector store built from the documents.
    """
    return Chroma.from_documents(
        documents=splits,
        embedding=HuggingFaceEmbeddings(),
        client=chromadb.EphemeralClient(),
        collection_name=collection_name,
    )
77
+
78
+ #|export
79
# Location of the PDF to index. It must exist when this module is executed.
PDF_PATH = '/content/data.pdf'

# Document chunks, kept as a module-level name for inspection.
splt = load_doc(PDF_PATH)

# initialize_database() returns (vector_db, collection_name, status).
vec = initialize_database(PDF_PATH)

# Reuse the database built above instead of embedding the same PDF a second
# time under the same collection name ('data').
# NOTE(review): the duplicate build may also have stored every chunk twice if
# chromadb's ephemeral clients share state — confirm.
vec_cre = vec[0]
87
+
88
+ #|export
89
def initialize_llmchain(temperature, max_tokens, top_k, vector_db):
    """Create a conversational retrieval chain over ``vector_db``.

    The chain uses the ``mistralai/Mixtral-8x7B-Instruct-v0.1`` model through
    ``HuggingFaceHub``, a ``ConversationBufferMemory`` stored under the
    ``chat_history`` key, and the vector store's default retriever.

    Args:
        temperature: Passed to the model as ``temperature``.
        max_tokens: Passed to the model as ``max_new_tokens``.
        top_k: Passed to the model as ``top_k``.
        vector_db: Vector store providing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``; it is set to return
        source documents alongside the answer.
    """
    generation_settings = {
        "temperature": temperature,
        "max_new_tokens": max_tokens,
        "top_k": top_k,
        "load_in_8bit": True,
    }

    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key='answer',
        return_messages=True,
    )
    hub_llm = HuggingFaceHub(
        repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
        model_kwargs=generation_settings,
    )
    doc_retriever = vector_db.as_retriever()

    return ConversationalRetrievalChain.from_llm(
        hub_llm,
        retriever=doc_retriever,
        chain_type="stuff",
        memory=chat_memory,
        return_source_documents=True,
        verbose=False,
    )
112
+
113
+ #|export
114
# Module-level QA chain used by conversation().
qa = initialize_llmchain(
    temperature=0.7,
    max_tokens=1024,
    top_k=1,
    vector_db=vec_cre,
)
115
+
116
+ #|export
117
def format_chat_history(message, chat_history):
    """Flatten chat history into a list of ``"User: ..."`` / ``"Assistant: ..."`` strings.

    Two history layouts are accepted:

    * pair-style: each entry is a ``(user_message, bot_message)`` pair, which
      yields one ``User`` line followed by one ``Assistant`` line;
    * message-style: each entry is a dict with ``"role"`` and ``"content"``
      keys, which yields one line per entry (role ``"user"`` maps to ``User``,
      any other role to ``Assistant``).

    Args:
        message: The current user message. Unused; kept so existing callers
            do not need to change.
        chat_history: Previous turns in either layout, or ``None``/empty.

    Returns:
        A list of formatted strings in conversation order; empty when there
        is no history.
    """
    formatted_chat_history = []
    # Treat a missing history as an empty one instead of failing on iteration.
    for turn in chat_history or []:
        if isinstance(turn, dict):
            speaker = "User" if turn.get("role") == "user" else "Assistant"
            formatted_chat_history.append(f"{speaker}: {turn.get('content')}")
        else:
            user_message, bot_message = turn
            formatted_chat_history.append(f"User: {user_message}")
            formatted_chat_history.append(f"Assistant: {bot_message}")
    return formatted_chat_history
123
+
124
+ #|export
125
def conversation(message, history):
    """Answer ``message`` using the module-level ``qa`` chain.

    The history is formatted with ``format_chat_history`` and sent to the
    chain together with the question. If the answer contains the marker
    ``"Helpful Answer:"``, only the text after its last occurrence is kept.

    Args:
        message: The user's question.
        history: Previous chat turns, as passed in by the chat UI.

    Returns:
        The answer text from the chain's ``"answer"`` field.
    """
    marker = "Helpful Answer:"
    payload = {
        "question": message,
        "chat_history": format_chat_history(message, history),
    }
    answer = qa(payload)["answer"]
    if marker in answer:
        answer = answer.split(marker)[-1]
    return answer
132
+
133
+ #|export
134
# Build the Gradio chat UI around conversation() and start serving it.
demo = gr.ChatInterface(conversation)
demo.launch()