File size: 1,758 Bytes
3f9aad8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fba35f8
3f9aad8
 
 
 
 
fba35f8
3f9aad8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import re
import os
import requests
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import HuggingFaceEmbeddings
def extract_repo_details(github_url):
    """Extract the repo owner, repo name, and file path from a GitHub URL.

    The URL must contain ``github.com/<owner>/<repo>/blob/main/<path>``;
    only links to the ``main`` branch are recognised.

    Args:
        github_url: URL of a file as shown in the GitHub web interface.

    Returns:
        A ``(repo_owner, repo_name, file_path)`` tuple of strings.

    Raises:
        ValueError: If the URL does not match the expected layout.
    """
    # The path capture stops at the first "?" or "#": a query string such as
    # "?plain=1" or a line anchor such as "#L10" belongs to the web page, not
    # to the file path, and would otherwise end up in the returned path.
    # "\n" is excluded as well so a trailing newline is still ignored.
    match = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/main/([^?#\n]+)", github_url
    )
    if not match:
        raise ValueError(f"Invalid GitHub URL format: {github_url}")

    repo_owner, repo_name, file_path = match.groups()
    return repo_owner, repo_name, file_path

def fetch_md_file_via_api(repo_owner, repo_name, file_path, token, timeout=30):
    """Fetch a Markdown file through the GitHub contents API.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        file_path: Path of the file inside the repository.
        token: GitHub access token. When falsy, no ``Authorization`` header
            is sent and the request is made unauthenticated.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The response body as text, or ``None`` if the request failed (the
        failure is printed rather than raised).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}"
    # The "raw" media type asks for the file contents themselves.
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    if token:
        # Skip the header when there is no token; sending "token None"
        # would make the request fail authentication outright.
        headers['Authorization'] = f'token {token}'

    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {file_path}. Error: {str(e)}")
        return None
    return response.text
    
def data_loader(data):
    """Load the ``*.md`` files selected by a ``DirectoryLoader``.

    ``data`` is passed to ``DirectoryLoader`` as its first argument together
    with the glob pattern ``*.md``; the result of ``load()`` is returned.
    """
    markdown_loader = DirectoryLoader(data, glob="*.md")
    documents = markdown_loader.load()
    return documents

def chunk_text(extracted_data):
    """Split the given documents into chunks.

    The documents are run through a ``RecursiveCharacterTextSplitter``
    configured with ``chunk_size=600`` and ``chunk_overlap=100``, and the
    result of ``split_documents`` is returned.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
    ).split_documents(extracted_data)

def download_hugging_face_embeddings():
    """Return a LangChain ``HuggingFaceEmbeddings`` for the configured model."""
    # Sentence-transformers model identifier handed to the embeddings wrapper.
    model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
    return HuggingFaceEmbeddings(model_name=model_name)