import requests
import base64
import re
import os
from urllib.parse import urlparse


def is_github_url(url):
    """Check if a URL is a GitHub repository URL."""
    parsed = urlparse(url)
    return parsed.netloc in ['github.com', 'www.github.com']


def parse_github_url(url):
    """Extract owner and repo from a GitHub URL."""
    parts = url.strip('/').split('/')
    if 'github.com' in parts:
        idx = parts.index('github.com')
        if len(parts) > idx + 2:
            owner = parts[idx + 1]
            repo = parts[idx + 2]
            return owner, repo
    return None, None


def get_repo_content(url, auth=None):
    """
    Get content from a GitHub repository using GitHub's API.
    Returns a dictionary of filenames and their content.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}

    try:
        # Fetch repository contents
        api_url = f"https://api.github.com/repos/{owner}/{repo}/contents"
        headers = {}

        # Add authentication if provided
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        contents = response.json()

        repo_content = {}

        # Process each file/directory
        for item in contents:
            if item['type'] == 'file' and item['name'].endswith(('.py', '.js', '.html', '.css', '.md')):
                # Get file content
                file_response = requests.get(item['url'], headers=headers)
                file_response.raise_for_status()
                file_data = file_response.json()

                if 'content' in file_data:
                    content = base64.b64decode(file_data['content']).decode('utf-8')
                    repo_content[item['name']] = content

                # Limit to the first 5 files to avoid exceeding API limits
                if len(repo_content) >= 5:
                    break

        return repo_content
    except Exception as e:
        return {"error": f"Error fetching repository: {str(e)}"}


def get_repo_structure(url, auth=None):
    """
    Get the structure of a GitHub repository.
    Returns a list of file paths in the repository.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}

    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"

        # Use GitHub's API to get repository contents
        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1"
        response = requests.get(api_url, headers=headers)

        # If the 'main' branch doesn't exist, try 'master'
        if response.status_code != 200:
            api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/master?recursive=1"
            response = requests.get(api_url, headers=headers)

        response.raise_for_status()
        data = response.json()

        # Extract file paths
        files = [item['path'] for item in data['tree'] if item['type'] == 'blob']
        return files
    except Exception as e:
        return {"error": f"Error fetching repository structure: {str(e)}"}
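
# A minimal usage sketch: _example_list_python_files is a hypothetical helper, not
# part of the API above, showing how a caller handles the mixed return shapes --
# get_repo_structure() returns a list of paths on success but a dict with an
# "error" key on failure, so results should be type-checked before iterating.
def _example_list_python_files(url, auth=None):
    """Hypothetical example: return the .py paths in a repository, or an error string."""
    structure = get_repo_structure(url, auth=auth)
    if isinstance(structure, dict) and "error" in structure:
        return structure["error"]
    return [path for path in structure if path.endswith('.py')]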

def get_repo_metadata(url, auth=None):
    """
    Get metadata about a GitHub repository such as description, stars, etc.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}

    try:
        # Use GitHub's API to get repository information
        api_url = f"https://api.github.com/repos/{owner}/{repo}"

        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        data = response.json()

        return {
            "name": data.get("name", ""),
            "description": data.get("description", ""),
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "language": data.get("language", ""),
            "url": data.get("html_url", "")
        }
    except Exception as e:
        return {"error": f"Error fetching repository metadata: {str(e)}"}


def parse_github_pr_url(url):
    """Extract owner, repo, and PR number from a GitHub PR URL."""
    pattern = r'https?://github\.com/([^/]+)/([^/]+)/pull/(\d+)'
    match = re.match(pattern, url)
    if match:
        owner, repo, pr_number = match.groups()
        return owner, repo, pr_number
    return None, None, None


def get_pr_details(pr_url, max_files=25, file_types=None, auth=None):
    """
    Get details of a GitHub Pull Request including changed files and their contents.
    Returns a dictionary with PR metadata and changes.

    Args:
        pr_url: URL of the GitHub PR
        max_files: Maximum number of files to fetch (default: 25)
        file_types: List of file extensions to include (default: None = all code files)
        auth: Optional tuple of (username, token) for authentication
    """
    owner, repo, pr_number = parse_github_pr_url(pr_url)
    if not owner or not repo or not pr_number:
        return {"error": "Invalid GitHub PR URL format"}

    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"

        # Fetch PR information
        api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        pr_data = response.json()

        # Get PR metadata
        pr_details = {
            "title": pr_data.get("title", ""),
            "description": pr_data.get("body", ""),
            "user": pr_data.get("user", {}).get("login", ""),
            "state": pr_data.get("state", ""),
            "created_at": pr_data.get("created_at", ""),
            "updated_at": pr_data.get("updated_at", ""),
            "target_branch": pr_data.get("base", {}).get("ref", ""),
            "source_branch": pr_data.get("head", {}).get("ref", ""),
            "changed_files": [],
            "total_file_count": pr_data.get("changed_files", 0)
        }

        # Default file types to include if not specified
        if file_types is None:
            file_types = ['.py', '.js', '.html', '.css', '.md', '.java', '.ts', '.jsx',
                          '.tsx', '.go', '.c', '.cpp', '.h', '.hpp', '.json', '.yml',
                          '.yaml', '.sh', '.txt', '.sql']

        # Fetch PR changed files with pagination
        page = 1
        while True:
            files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files?per_page=100&page={page}"
            files_response = requests.get(files_url, headers=headers)
            files_response.raise_for_status()
            files_data = files_response.json()

            # If there are no more files, stop paginating
            if not files_data:
                break

            # Process each file in this page
            for file_data in files_data:
                filename = file_data.get("filename", "")

                # Skip binary files and non-code files
                file_ext = os.path.splitext(filename)[1].lower()
                if file_types and file_ext not in file_types:
                    continue

                file_info = {
                    "filename": filename,
                    "status": file_data.get("status", ""),  # added, modified, removed
                    "additions": file_data.get("additions", 0),
                    "deletions": file_data.get("deletions", 0),
                    "patch": file_data.get("patch", "")
                }

                # Add file content if it still exists in the PR
                if file_data.get("status") != "removed":
                    try:
                        file_content_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{pr_data['head']['sha']}/{filename}"
                        content_response = requests.get(file_content_url, headers=headers)
                        if content_response.status_code == 200:
                            file_info["content"] = content_response.text
                    except Exception as e:
                        file_info["content_error"] = str(e)

                pr_details["changed_files"].append(file_info)

                # Stop when we reach the maximum number of files
                if len(pr_details["changed_files"]) >= max_files:
                    break

            # If we've reached max files or there are no more pages, stop
            if len(pr_details["changed_files"]) >= max_files or len(files_data) < 100:
                break

            # Move to the next page
            page += 1

        return pr_details
    except Exception as e:
        return {"error": f"Error fetching PR details: {str(e)}"}
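
# A minimal usage sketch for get_pr_details(): _print_pr_summary is a hypothetical
# helper, not part of the API above. It relies only on the dict shape documented
# in get_pr_details() and prints the PR metadata plus one line per changed file.
def _print_pr_summary(pr_url, auth=None):
    """Hypothetical example: print a short summary of a PR and its changed files."""
    details = get_pr_details(pr_url, max_files=10, file_types=['.py'], auth=auth)
    if "error" in details:
        print(details["error"])
        return
    print(f"{details['title']} ({details['state']}) by {details['user']}")
    print(f"{details['source_branch']} -> {details['target_branch']}")
    for f in details["changed_files"]:
        print(f"  {f['status']:>8}  {f['filename']} (+{f['additions']}/-{f['deletions']})")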

def get_target_branch_code(pr_url, max_files=25, file_types=None, auth=None):
    """
    Get the code from the target branch of a PR.
    Returns a dictionary of filenames and their content from the target branch.

    Args:
        pr_url: URL of the GitHub PR
        max_files: Maximum number of files to fetch (default: 25)
        file_types: List of file extensions to include (default: None = all code files)
        auth: Optional tuple of (username, token) for authentication
    """
    owner, repo, pr_number = parse_github_pr_url(pr_url)
    if not owner or not repo or not pr_number:
        return {"error": "Invalid GitHub PR URL format"}

    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"

        # First get the PR to find the target branch name
        api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        pr_data = response.json()
        target_branch = pr_data.get("base", {}).get("ref", "main")  # Default to main if not found

        # Default file types to include if not specified
        if file_types is None:
            file_types = ['.py', '.js', '.html', '.css', '.md', '.java', '.ts', '.jsx',
                          '.tsx', '.go', '.c', '.cpp', '.h', '.hpp', '.json', '.yml',
                          '.yaml', '.sh', '.txt', '.sql']

        # Get files that were changed in the PR, with pagination
        page = 1
        target_branch_code = {}
        while True:
            files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files?per_page=100&page={page}"
            files_response = requests.get(files_url, headers=headers)
            files_response.raise_for_status()
            files_data = files_response.json()

            # If there are no more files, stop paginating
            if not files_data:
                break

            # Get the changed filenames from this page
            for file_data in files_data:
                filename = file_data.get("filename")

                # Skip if filename is None or has a non-matching extension
                if not filename:
                    continue
                file_ext = os.path.splitext(filename)[1].lower()
                if file_types and file_ext not in file_types:
                    continue

                try:
                    # Get file content from the target branch
                    file_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{target_branch}/{filename}"
                    file_response = requests.get(file_url, headers=headers)
                    if file_response.status_code == 200:
                        target_branch_code[filename] = file_response.text
                except Exception as e:
                    print(f"Error fetching {filename} from target branch: {str(e)}")

                # Stop when we reach the maximum number of files
                if len(target_branch_code) >= max_files:
                    break

            # If we've reached max files or there are no more pages, stop
            if len(target_branch_code) >= max_files or len(files_data) < 100:
                break

            # Move to the next page
            page += 1

        return target_branch_code
    except Exception as e:
        return {"error": f"Error fetching target branch code: {str(e)}"}
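
# A hypothetical sketch pairing get_pr_details() with get_target_branch_code() to
# build a before/after view of each changed file. _pr_file_pairs is illustrative,
# not part of the API above; either side of a pair may be None (added or removed
# files), and the two calls hit the GitHub API separately, so they count twice
# against rate limits.
def _pr_file_pairs(pr_url, max_files=25, file_types=None, auth=None):
    """Hypothetical example: map each changed filename to (target_branch_text, pr_head_text)."""
    details = get_pr_details(pr_url, max_files=max_files, file_types=file_types, auth=auth)
    if "error" in details:
        return details
    old_code = get_target_branch_code(pr_url, max_files=max_files, file_types=file_types, auth=auth)
    if "error" in old_code:
        return old_code
    return {
        f["filename"]: (old_code.get(f["filename"]), f.get("content"))
        for f in details["changed_files"]
    }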

def verify_github_credentials(username, token):
    """
    Verify GitHub credentials by making a test API call.
    Returns True if credentials are valid, False otherwise.

    Args:
        username: GitHub username
        token: GitHub personal access token
    """
    try:
        # Create the Basic authentication header
        auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
        headers = {"Authorization": f"Basic {auth_header}"}

        # Make a test API call to get user information
        response = requests.get("https://api.github.com/user", headers=headers)

        # Return True if the request was successful (status code 200)
        return response.status_code == 200
    except Exception as e:
        print(f"Error verifying GitHub credentials: {str(e)}")
        return False
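
# A minimal smoke test. The GITHUB_USERNAME and GITHUB_TOKEN environment variable
# names are an assumption of this sketch; the module itself never reads them.
if __name__ == "__main__":
    _username = os.environ.get("GITHUB_USERNAME", "")
    _token = os.environ.get("GITHUB_TOKEN", "")
    if _username and _token:
        if verify_github_credentials(_username, _token):
            print("GitHub credentials are valid")
        else:
            print("GitHub credentials are invalid")
    else:
        print("Set GITHUB_USERNAME and GITHUB_TOKEN to run the credential check")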