Spaces:
Runtime error
Runtime error
File size: 1,758 Bytes
3f9aad8 fba35f8 3f9aad8 fba35f8 3f9aad8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import re
import os
import requests
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
def extract_repo_details(github_url):
    """Extract the repo owner, repo name, and file path from a GitHub blob URL.

    The URL must contain ``github.com/<owner>/<repo>/blob/main/<path>``; only
    the ``main`` branch is recognised by the pattern.

    Args:
        github_url: URL of a file as shown in the GitHub web interface.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If the URL does not match the expected layout.
    """
    # Stop the path at the first "?" or "#": a query string ("?plain=1") or a
    # line anchor ("#L10") copied from the browser is not part of the file
    # path and would otherwise end up in the API request.
    match = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#]+)", github_url
    )
    if match is None:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")
    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path
def fetch_md_file_via_api(repo_owner, repo_name, file_path, token, timeout=30):
    """Fetch the raw contents of a file through the GitHub contents API.

    Args:
        repo_owner: Owner segment of the repository URL.
        repo_name: Repository name.
        file_path: Path of the file inside the repository.
        token: Access token sent in the ``Authorization`` header. When falsy
            (``None`` or ``""``) the header is omitted and the request is
            made unauthenticated instead of sending ``token None``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        The response body as text, or ``None`` if the request failed (the
        failure is printed rather than raised).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}"
    # The raw media type makes the API return the file body itself.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        headers['Authorization'] = f'token {token}'

    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the problem and let the caller handle None.
        print(f"Failed to fetch {file_path}. Error: {str(e)}")
        return None
    return response.text
def data_loader(data):
    """Load documents from ``data`` with a ``DirectoryLoader`` using the ``*.md`` glob.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    markdown_loader = DirectoryLoader(data, glob="*.md")
    documents = markdown_loader.load()
    return documents
def chunk_text(extracted_data):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=600`` and
    ``chunk_overlap=100``.

    Args:
        extracted_data: Documents passed to ``split_documents``.

    Returns:
        The result of ``split_documents`` for ``extracted_data``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
    ).split_documents(extracted_data)
def download_hugging_face_embeddings():
    """Build the LangChain ``HuggingFaceEmbeddings`` object used by this module.

    Returns:
        A ``HuggingFaceEmbeddings`` instance created for the
        ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
    )