import os
import re
from typing import Optional

import requests
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Matches "github.com/<owner>/<repo>/blob/main/<path>". The path capture stops
# at "?" or "#" so a query string or fragment is not mistaken for part of the
# file path.
_GITHUB_BLOB_URL_RE = re.compile(
    r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#]+)"
)


def extract_repo_details(github_url: str):
    """Extract the repo owner, repo name, and file path from a GitHub URL.

    Only URLs that point at a file on the ``main`` branch
    (``.../blob/main/<path>``) are recognised.

    Args:
        github_url: URL of a file on github.com.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If the URL does not match the expected format.
    """
    match = _GITHUB_BLOB_URL_RE.search(github_url)
    if not match:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")
    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path


def fetch_md_file_via_api(
    repo_owner: str,
    repo_name: str,
    file_path: str,
    token: Optional[str],
    timeout: float = 30,
) -> Optional[str]:
    """Fetch a Markdown file through the GitHub contents API.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        file_path: Path of the file inside the repository.
        token: GitHub access token. When falsy, no ``Authorization`` header
            is sent and the request is made unauthenticated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or ``None`` if the request failed (the
        failure is reported with ``print``).
    """
    api_url = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}"
        f"/contents/{file_path}"
    )
    # The "raw" media type asks the API for the file content itself.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        # Avoid sending the literal header "token None" when no token is set.
        headers['Authorization'] = f'token {token}'

    try:
        # Without a timeout, requests may block indefinitely on a stalled
        # connection.
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {file_path}. Error: {str(e)}")
        return None
    return response.text


def data_loader(data):
    """Load the ``*.md`` files found under the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    loader = DirectoryLoader(
        data,
        glob="*.md",
    )
    return loader.load()


def chunk_text(extracted_data):
    """Split loaded documents into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with ``chunk_size=600`` and
    ``chunk_overlap=100``.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``data_loader``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
    )
    return text_splitter.split_documents(extracted_data)


def download_hugging_face_embeddings():
    """Create the embedding model wrapper used by this module.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` model.
    """
    # Using HuggingFaceEmbeddings from Langchain to load the model
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
    )
    return embeddings