SergeyO7 committed on
Commit
8edd424
·
verified ·
1 Parent(s): 79bbc48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -21
app.py CHANGED
@@ -1,44 +1,36 @@
1
- # from langchain.document_loaders import DirectoryLoader
2
- from langchain_community.document_loaders import DirectoryLoader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain.schema import Document
5
- # from langchain.embeddings import OpenAIEmbeddings
6
  from langchain_openai import OpenAIEmbeddings
7
  from langchain_community.vectorstores import Chroma
8
- import openai
9
-
10
  from dotenv import load_dotenv
11
  import os
12
- import shutil
13
 
14
- # Load environment variables. Assumes that project contains .env file with API keys
15
  load_dotenv()
16
- #---- Set OpenAI API key
17
- # Change environment variable name from "OPENAI_API_KEY" to the name given in
18
- # your .env file.
19
- openai.api_key = os.environ['OPENAI_API_KEY']
20
 
21
  CHROMA_PATH = "chroma"
22
- DATA_PATH = ""
23
 
24
-
25
- def
26
- main():
27
  generate_data_store()
28
 
29
-
30
  def generate_data_store():
31
  documents = load_documents()
32
- chunks = split_text(documents)
33
- save_to_chroma(chunks)
34
-
35
 
36
  def load_documents():
37
- loader = DirectoryLoader(DATA_PATH, glob="pl25032025.md")
 
 
 
 
38
  documents = loader.load()
39
  return documents
40
 
41
-
42
  def split_text(documents: list[Document]):
43
  text_splitter = RecursiveCharacterTextSplitter(
44
  chunk_size=300,
 
1
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
 
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain.schema import Document
 
4
  from langchain_openai import OpenAIEmbeddings
5
  from langchain_community.vectorstores import Chroma
 
 
6
  from dotenv import load_dotenv
7
  import os
 
8
 
9
+ # Load environment variables
10
  load_dotenv()
11
+ # Assumes OPENAI_API_KEY is set in .env
 
 
 
12
 
13
  CHROMA_PATH = "chroma"
14
+ DATA_PATH = "" # Update this to your actual data path
15
 
16
+ def main():
 
 
17
  generate_data_store()
18
 
 
19
  def generate_data_store():
20
  documents = load_documents()
21
+ if documents:
22
+ chunks = split_text(documents)
23
+ save_to_chroma(chunks)
24
 
25
  def load_documents():
26
+ file_path = os.path.join(DATA_PATH, "pl25032025.md")
27
+ if not os.path.exists(file_path):
28
+ print(f"Error: File {file_path} not found.")
29
+ return []
30
+ loader = UnstructuredMarkdownLoader(file_path)
31
  documents = loader.load()
32
  return documents
33
 
 
34
  def split_text(documents: list[Document]):
35
  text_splitter = RecursiveCharacterTextSplitter(
36
  chunk_size=300,