joe4ai committed on
Commit
3f9aad8
·
verified ·
1 Parent(s): 42a7506

Create get_data.py

Browse files
Files changed (1) hide show
  1. get_data.py +47 -0
get_data.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import requests
4
+ from langchain.document_loaders import DirectoryLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
def extract_repo_details(github_url):
    """Extract the repo owner, repo name, and file path from a GitHub blob URL.

    Only URLs of the form ``github.com/<owner>/<repo>/blob/main/<path>`` are
    recognised. A trailing query string or fragment (for example ``?plain=1``
    or ``#L10``) is not part of the file path and is dropped.

    Args:
        github_url: URL of a file on the ``main`` branch of a GitHub repo.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If ``github_url`` does not match the expected format.
    """
    # Stop the path capture at "?" or "#": links copied from a browser often
    # carry "?plain=1" or a "#heading" anchor, which would otherwise leak into
    # the returned file path.
    match = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#]+)", github_url
    )
    if not match:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")

    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path
15
+
16
def fetch_md_file_via_api(repo_owner, repo_name, file_path, token, timeout=30):
    """Fetch a file's raw text through the GitHub repository contents API.

    Args:
        repo_owner: Owner segment of the repository path.
        repo_name: Repository name.
        file_path: Path of the file inside the repository.
        token: GitHub access token. When falsy (``None`` or ``""``) no
            ``Authorization`` header is sent and the request is anonymous.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the caller forever.

    Returns:
        The response body as text, or ``None`` if the request failed (the
        failure is printed rather than raised).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}"
    # The "raw" media type asks the API for the file content itself.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        # Sending "token None" would make an otherwise valid anonymous
        # request fail authentication, so only add the header when set.
        headers['Authorization'] = f'token {token}'

    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the problem and let the caller handle None.
        print(f"Failed to fetch {file_path}. Error: {str(e)}")
        return None
    return response.text
31
+
32
def data_loader(data):
    """Load Markdown documents from a directory.

    Builds a ``DirectoryLoader`` over ``data`` with the glob pattern ``*.md``
    and returns whatever its ``load()`` call produces.
    """
    markdown_loader = DirectoryLoader(data, glob="*.md")
    documents = markdown_loader.load()
    return documents
38
+
39
def chunk_text(extracted_data, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; handed unchanged to the
            splitter's ``split_documents`` method.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            100, the value previously hard-coded here.

    Returns:
        The result of ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
43
+
44
def download_hugging_face_embeddings():
    """Return a ``HuggingFaceEmbeddings`` object for the all-MiniLM-L6-v2 model."""
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    # The LangChain wrapper is given the model by name.
    return HuggingFaceEmbeddings(model_name=model_id)