# GitHubHelper — github_utils.py
# (Provenance: HuggingFace Space by MingZ6, commit b52dd21 "init project")
import requests
from bs4 import BeautifulSoup
import base64
import re
import os
from urllib.parse import urlparse
def is_github_url(url):
    """Return True when *url*'s host is github.com (with or without www)."""
    host = urlparse(url).netloc
    return host == 'github.com' or host == 'www.github.com'
def parse_github_url(url):
    """Extract ``(owner, repo)`` from a GitHub repository URL.

    Accepts plain repository URLs as well as ``.git`` clone URLs
    (the suffix is stripped from the repository name).

    Args:
        url: GitHub repository URL.

    Returns:
        ``(owner, repo)`` on success, ``(None, None)`` when the URL does
        not contain an owner/repo pair after the github.com host.
    """
    parts = url.strip('/').split('/')
    if 'github.com' in parts:
        idx = parts.index('github.com')
        if len(parts) > idx + 2:
            owner = parts[idx + 1]
            repo = parts[idx + 2]
            # Clone URLs commonly carry a ".git" suffix; normalize it away
            # so API URLs built from the result are valid.
            if repo.endswith('.git'):
                repo = repo[:-4]
            return owner, repo
    return None, None
def get_repo_content(url, auth=None):
    """
    Get content from a GitHub repository using GitHub's API.

    Fetches the repository's top-level listing and downloads the text of
    files with common source extensions, stopping after 5 files to stay
    within API rate limits.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication

    Returns:
        Dict mapping filename -> decoded file content, or a dict with an
        "error" key on any failure.
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}
    try:
        # Fetch repository contents (top level only; the API returns a list).
        api_url = f"https://api.github.com/repos/{owner}/{repo}/contents"
        headers = {}
        # Add authentication if provided
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"
        # requests has no default timeout; without one a stalled connection
        # would hang this call indefinitely.
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        contents = response.json()
        repo_content = {}
        # Process each file entry, skipping directories and non-code files.
        for item in contents:
            if item['type'] == 'file' and item['name'].endswith(('.py', '.js', '.html', '.css', '.md')):
                # Get file content (the contents API returns base64 payloads).
                file_response = requests.get(item['url'], headers=headers, timeout=30)
                file_response.raise_for_status()
                file_data = file_response.json()
                if 'content' in file_data:
                    content = base64.b64decode(file_data['content']).decode('utf-8')
                    repo_content[item['name']] = content
                # Limit to first 5 files to avoid exceeding API limits
                if len(repo_content) >= 5:
                    break
        return repo_content
    except Exception as e:
        return {"error": f"Error fetching repository: {str(e)}"}
def get_repo_structure(url, auth=None):
    """
    Get the structure of a GitHub repository.

    Tries the recursive git-tree listing of the "main" branch first and
    falls back to "master" when that branch does not exist.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication

    Returns:
        List of file paths (blob entries only), or a dict with an "error"
        key on any failure.
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}
    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"
        # Use GitHub's API to get repository contents. requests has no
        # default timeout, so set one to avoid hanging on a stalled socket.
        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1"
        response = requests.get(api_url, headers=headers, timeout=30)
        # If 'main' branch doesn't exist, try 'master'
        if response.status_code != 200:
            api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/master?recursive=1"
            response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        # Extract file paths; "blob" entries are files, "tree" entries are
        # directories and are skipped.
        files = [item['path'] for item in data['tree'] if item['type'] == 'blob']
        return files
    except Exception as e:
        return {"error": f"Error fetching repository structure: {str(e)}"}
def get_repo_metadata(url, auth=None):
    """
    Get metadata about a GitHub repository such as description, stars, etc.

    Args:
        url: GitHub repository URL
        auth: Optional tuple of (username, token) for authentication

    Returns:
        Dict with name/description/stars/forks/language/url fields, or a
        dict with an "error" key on any failure.
    """
    owner, repo = parse_github_url(url)
    if not owner or not repo:
        return {"error": "Invalid GitHub URL format"}
    try:
        # Use GitHub's API to get repository information
        api_url = f"https://api.github.com/repos/{owner}/{repo}"
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"
        # Timeout guards against a stalled connection hanging the caller
        # (requests has no default timeout).
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        return {
            "name": data.get("name", ""),
            "description": data.get("description", ""),
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "language": data.get("language", ""),
            "url": data.get("html_url", "")
        }
    except Exception as e:
        return {"error": f"Error fetching repository metadata: {str(e)}"}
def parse_github_pr_url(url):
    """Return ``(owner, repo, pr_number)`` parsed from a GitHub PR URL.

    The PR number is returned as a string; all three values are ``None``
    when the URL is not a github.com pull-request URL.
    """
    match = re.match(r'https?://github\.com/([^/]+)/([^/]+)/pull/(\d+)', url)
    if not match:
        return None, None, None
    return match.groups()
def get_pr_details(pr_url, max_files=25, file_types=None, auth=None):
    """
    Get details of a GitHub Pull Request including changed files and their contents.

    Args:
        pr_url: URL of the GitHub PR
        max_files: Maximum number of files to fetch (default: 25)
        file_types: List of file extensions to include (default: None = all code files)
        auth: Optional tuple of (username, token) for authentication

    Returns:
        Dict with PR metadata and a "changed_files" list, or a dict with
        an "error" key on any failure.
    """
    owner, repo, pr_number = parse_github_pr_url(pr_url)
    if not owner or not repo or not pr_number:
        return {"error": "Invalid GitHub PR URL format"}
    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"
        # Fetch PR information (timeouts guard against stalled connections;
        # requests has no default timeout).
        api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        pr_data = response.json()
        # Get PR metadata
        pr_details = {
            "title": pr_data.get("title", ""),
            "description": pr_data.get("body", ""),
            "user": pr_data.get("user", {}).get("login", ""),
            "state": pr_data.get("state", ""),
            "created_at": pr_data.get("created_at", ""),
            "updated_at": pr_data.get("updated_at", ""),
            "target_branch": pr_data.get("base", {}).get("ref", ""),
            "source_branch": pr_data.get("head", {}).get("ref", ""),
            "changed_files": [],
            "total_file_count": pr_data.get("changed_files", 0)
        }
        # Default file types to include if not specified
        if file_types is None:
            file_types = ['.py', '.js', '.html', '.css', '.md', '.java', '.ts', '.jsx',
                          '.tsx', '.go', '.c', '.cpp', '.h', '.hpp', '.json', '.yml',
                          '.yaml', '.sh', '.txt', '.sql']
        # Fetch PR changed files with pagination (100 per page is the API max).
        page = 1
        while True:
            files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files?per_page=100&page={page}"
            files_response = requests.get(files_url, headers=headers, timeout=30)
            files_response.raise_for_status()
            files_data = files_response.json()
            # If no more files, break the loop
            if not files_data:
                break
            # Process each file in this page
            for file_data in files_data:
                filename = file_data.get("filename", "")
                # Skip binary files and non-code files
                file_ext = os.path.splitext(filename)[1].lower()
                if file_types and file_ext not in file_types:
                    continue
                file_info = {
                    "filename": filename,
                    "status": file_data.get("status", ""),  # added, modified, removed
                    "additions": file_data.get("additions", 0),
                    "deletions": file_data.get("deletions", 0),
                    "patch": file_data.get("patch", "")
                }
                # Add file content if it exists in the PR
                if file_data.get("status") != "removed":
                    try:
                        # Fix: the raw-content URL must end with the changed
                        # file's path (it previously contained a literal
                        # "(unknown)" placeholder), pinned to the PR head SHA.
                        file_content_url = (
                            f"https://raw.githubusercontent.com/{owner}/{repo}/"
                            f"{pr_data['head']['sha']}/{filename}"
                        )
                        content_response = requests.get(file_content_url, headers=headers, timeout=30)
                        if content_response.status_code == 200:
                            file_info["content"] = content_response.text
                    except Exception as e:
                        file_info["content_error"] = str(e)
                pr_details["changed_files"].append(file_info)
                # Stop when we reach the maximum number of files
                if len(pr_details["changed_files"]) >= max_files:
                    break
            # If we've reached max files or there are no more pages, break
            if len(pr_details["changed_files"]) >= max_files or len(files_data) < 100:
                break
            # Move to next page
            page += 1
        return pr_details
    except Exception as e:
        return {"error": f"Error fetching PR details: {str(e)}"}
def get_target_branch_code(pr_url, max_files=25, file_types=None, auth=None):
    """
    Get the code from the target branch of a PR.

    For each file changed in the PR, fetches that file's current content
    on the PR's target (base) branch.

    Args:
        pr_url: URL of the GitHub PR
        max_files: Maximum number of files to fetch (default: 25)
        file_types: List of file extensions to include (default: None = all code files)
        auth: Optional tuple of (username, token) for authentication

    Returns:
        Dict mapping filename -> target-branch content, or a dict with an
        "error" key on any failure.
    """
    owner, repo, pr_number = parse_github_pr_url(pr_url)
    if not owner or not repo or not pr_number:
        return {"error": "Invalid GitHub PR URL format"}
    try:
        # Prepare headers for authentication
        headers = {}
        if auth and len(auth) == 2:
            username, token = auth
            auth_header = base64.b64encode(f"{username}:{token}".encode()).decode()
            headers["Authorization"] = f"Basic {auth_header}"
        # First get the PR to find the target branch name. Timeouts guard
        # against stalled connections (requests has no default timeout).
        api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        pr_data = response.json()
        target_branch = pr_data.get("base", {}).get("ref", "main")  # Default to main if not found
        # Default file types to include if not specified
        if file_types is None:
            file_types = ['.py', '.js', '.html', '.css', '.md', '.java', '.ts', '.jsx',
                          '.tsx', '.go', '.c', '.cpp', '.h', '.hpp', '.json', '.yml',
                          '.yaml', '.sh', '.txt', '.sql']
        # Get files that were changed in the PR with pagination
        page = 1
        target_branch_code = {}
        while True:
            files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files?per_page=100&page={page}"
            files_response = requests.get(files_url, headers=headers, timeout=30)
            files_response.raise_for_status()
            files_data = files_response.json()
            # If no more files, break the loop
            if not files_data:
                break
            # Get the changed filenames from this page
            for file_data in files_data:
                filename = file_data.get("filename")
                # Skip if filename is None or non-matching extension
                if not filename:
                    continue
                file_ext = os.path.splitext(filename)[1].lower()
                if file_types and file_ext not in file_types:
                    continue
                try:
                    # Fix: the raw-content URL must end with the file's path
                    # (it previously contained a literal "(unknown)"
                    # placeholder) on the target branch.
                    file_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{target_branch}/{filename}"
                    file_response = requests.get(file_url, headers=headers, timeout=30)
                    if file_response.status_code == 200:
                        target_branch_code[filename] = file_response.text
                except Exception as e:
                    print(f"Error fetching {filename} from target branch: {str(e)}")
                # Stop when we reach the maximum number of files
                if len(target_branch_code) >= max_files:
                    break
            # If we've reached max files or there are no more pages, break
            if len(target_branch_code) >= max_files or len(files_data) < 100:
                break
            # Move to next page
            page += 1
        return target_branch_code
    except Exception as e:
        return {"error": f"Error fetching target branch code: {str(e)}"}
def verify_github_credentials(username, token):
    """
    Check whether a username/token pair is accepted by GitHub.

    Makes one authenticated call to the /user endpoint and reports whether
    it succeeded.

    Args:
        username: GitHub username
        token: GitHub personal access token

    Returns:
        True when the API call returns HTTP 200, False otherwise.
    """
    try:
        # Build the HTTP Basic authentication header from the credentials.
        credentials = f"{username}:{token}".encode()
        headers = {"Authorization": f"Basic {base64.b64encode(credentials).decode()}"}
        # Probe the authenticated-user endpoint.
        response = requests.get("https://api.github.com/user", headers=headers)
        return response.status_code == 200
    except Exception as e:
        print(f"Error verifying GitHub credentials: {str(e)}")
        return False