Spaces:
Runtime error
Runtime error
import re | |
import os | |
import requests | |
from langchain.document_loaders import DirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceEmbeddings | |
def extract_repo_details(github_url):
    """Extract the repo owner, repo name, and file path from a GitHub URL.

    The URL must contain ``github.com/<owner>/<repo>/blob/main/<path>``.
    A trailing ``?query`` or ``#fragment`` (for example ``?plain=1`` or a
    heading anchor) is not part of the file path and is left out of it.

    Args:
        github_url: URL of a file on the ``main`` branch of a GitHub repo.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If the URL does not match the expected format.
    """
    # The path capture stops at the first "?" or "#" so that browser-only
    # suffixes never leak into the path used to build the API request.
    match = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#]+)", github_url
    )
    if not match:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")
    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path
def fetch_md_file_via_api(repo_owner, repo_name, file_path, token, timeout=10):
    """Fetch a Markdown file through the GitHub contents API.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        file_path: Path of the file inside the repository.
        token: GitHub access token. When falsy (``None`` or ``""``) no
            ``Authorization`` header is sent, instead of sending the
            literal value ``token None``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            so a stalled connection cannot block the caller forever.

    Returns:
        The response body as text, or ``None`` if the request failed
        (the failure is printed, not raised).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}"
    # Ask for the raw media type so the body is the file content itself.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        headers['Authorization'] = f'token {token}'
    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the problem and let the caller decide.
        print(f"Failed to fetch {file_path}. Error: {str(e)}")
        return None
    return response.text
def data_loader(data):
    """Load the Markdown documents found under the ``data`` directory.

    Builds a ``DirectoryLoader`` for ``data`` restricted to the ``*.md``
    glob and returns the result of its ``load()`` call.
    """
    markdown_loader = DirectoryLoader(data, glob="*.md")
    documents = markdown_loader.load()
    return documents
def chunk_text(extracted_data, chunk_size=600, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to the
            previously hard-coded value of 600.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults
            to the previously hard-coded value of 100.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)
def download_hugging_face_embeddings():
    """Return a LangChain ``HuggingFaceEmbeddings`` object for the project model.

    The model name is fixed to
    ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1``.
    """
    model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
    return HuggingFaceEmbeddings(model_name=model_name)