Spaces:

HumbleBeeAI
/

humblebee-chatbot

Runtime error

App Files Files Community

joe4ai committed on Feb 6

Commit

3f9aad8

verified ·

1 Parent(s): 42a7506

Create get_data.py

Browse files

Files changed (1) hide show

get_data.py +47 -0

get_data.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import re
+import os
+import requests
+from langchain.document_loaders import DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
def extract_repo_details(github_url):
    """Extract the repo owner, repo name, and file path from a GitHub URL.

    Only URLs of the form ``github.com/<owner>/<repo>/blob/main/<path>`` are
    recognised; the branch segment must be ``main``.  A trailing query string
    (``?plain=1``) or fragment (``#L10``) is not part of the file path and is
    dropped from the result.

    Args:
        github_url: URL of a file as shown in the GitHub web UI.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If the URL does not match the expected format.
    """
    # ``[^?#]+`` stops the path capture before any query string or fragment,
    # so "README.md?plain=1" and "README.md#usage" both yield "README.md".
    match = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#]+)", github_url
    )
    if match is None:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")
    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path
def fetch_md_file_via_api(repo_owner, repo_name, file_path, token, timeout=30):
    """Fetch the raw contents of a file through the GitHub contents API.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        file_path: Path of the file inside the repository.
        token: GitHub access token.  When falsy (``None`` or ``""``) the
            request is sent without an ``Authorization`` header instead of
            sending the literal value ``"token None"``.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, or ``None`` if the request failed (the
        failure is reported on stdout).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}"
    # The "raw" media type asks the API for the file body itself rather than
    # a JSON description of it.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        headers['Authorization'] = f'token {token}'
    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the problem and let the caller decide
        # what to do with a missing file.
        print(f"Failed to fetch {file_path}. Error: {e}")
        return None
    return response.text
def data_loader(data):
    """Load the documents matching ``*.md`` from the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    markdown_loader = DirectoryLoader(data, glob="*.md")
    documents = markdown_loader.load()
    return documents
def chunk_text(extracted_data, chunk_size=1000, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``
            (default 1000, the value previously hard-coded here).
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``
            (default 100, the value previously hard-coded here).

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
def download_hugging_face_embeddings():
    """Build the embeddings object used by the application.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)