NEXAS committed on
Commit fd95cf2 · verified · 1 Parent(s): b0d791e

Create utils/ingest_text.py

Files changed (1)
  1. utils/ingest_text.py +105 -0
utils/ingest_text.py ADDED
@@ -0,0 +1,105 @@
+ import os
+ import pickle
+
+ import nest_asyncio  # noqa: E402
+ nest_asyncio.apply()
+
+ # bring in our LLAMA_CLOUD_API_KEY
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ ##### LLAMAPARSE #####
+ from llama_parse import LlamaParse
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.document_loaders import DirectoryLoader
+
+
+ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+ qdrant_url = os.getenv("QDRANT_URL")
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
+
+ # to_parse_documents = ["./data/example.pdf", "./data/uber_10q_march_2022.pdf"]
+
+ parsed_data_file = r"data/parsed_data.pkl"
+ output_md = r"data/output.md"
+ loki = r"data"
+
+ # Define a function to load parsed data if available, or parse if not
+ def load_or_parse_data(loc):
+     data_file = parsed_data_file
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the cache file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsing_instruction = """The provided document can be a brochure, textbook, or guide.
+         It contains many images and tables.
+         Try to be precise while answering the questions."""
+         parser = LlamaParse(
+             api_key=llamaparse_api_key,
+             result_type="markdown",
+             parsing_instruction=parsing_instruction,
+         )
+         llama_parse_documents = parser.load_data(loc)
+
+         # Save the parsed data to the cache file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+
+ # Create vector database
+ def create_vector_database(loc):
+     """
+     Creates a vector database using document loaders and embeddings.
+
+     This function loads the parsed documents, splits them into chunks,
+     transforms the chunks into embeddings using FastEmbedEmbeddings,
+     and finally persists the embeddings into a Qdrant vector database.
+     """
+     # Call the function to either load or parse the data
+     llama_parse_documents = load_or_parse_data(loc)
+     # print(llama_parse_documents[1].text[:100])
+
+     # Append the parsed markdown to the output file
+     with open(output_md, "a", encoding="utf-8") as f:
+         for doc in llama_parse_documents:
+             f.write(doc.text + "\n")
+
+     loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
+     documents = loader.load()
+     # Split loaded documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+     docs = text_splitter.split_documents(documents)
+
+     # len(docs)
+     # docs[0]
+
+     # Initialize embeddings
+     embeddings = FastEmbedEmbeddings()
+
+     # Create and persist a Qdrant vector database from the chunked documents
+     qdrant = Qdrant.from_documents(
+         documents=docs,
+         embedding=embeddings,
+         url=qdrant_url,
+         collection_name="rag",
+         api_key=qdrant_api_key,
+     )
+
+     # query it
+     # query = "what is the agenda of Financial Statements for 2022?"
+     # found_doc = qdrant.similarity_search(query, k=3)
+     # print(found_doc[0].page_content[:100])
+
+     print("Vector DB created successfully!")
+
+
+ if __name__ == "__main__":
+     create_vector_database(loki)
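
For context, a minimal usage sketch (not part of the commit) of how the resulting "rag" collection could be queried afterwards, based on the similarity-search snippet left commented out in the script. The reconnection via QdrantClient and the example query text are illustrative assumptions; only the collection name, embedding model, and environment variables come from the script itself.

# A hedged sketch, not part of the commit: query the "rag" collection
# that create_vector_database() persists. Assumes the same .env values
# (QDRANT_URL, QDRANT_API_KEY) and an already-populated collection.
import os

from dotenv import load_dotenv
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient

load_dotenv()

client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))

# Re-wrap the existing collection with the same embedding model used at ingest time
qdrant = Qdrant(client=client, collection_name="rag", embeddings=FastEmbedEmbeddings())

# Illustrative query: retrieve the top-3 most similar chunks
found_docs = qdrant.similarity_search("What is the agenda of Financial Statements for 2022?", k=3)
for doc in found_docs:
    print(doc.page_content[:100])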