Spaces:

zamal
/

DeepGit-lite

Running on Zero

DeepGit-lite / main.py

zamalali

Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback

777083e 2 months ago

11.5 kB

	import os
	import base64
	import requests
	import numpy as np
	import faiss
	import re
	from sentence_transformers import SentenceTransformer
	from dotenv import load_dotenv
	from pathlib import Path
	from langchain_groq import ChatGroq
	from langchain_core.prompts import ChatPromptTemplate

	# Optionally import BM25 for sparse retrieval.
	try:
	from rank_bm25 import BM25Okapi
	except ImportError:
	BM25Okapi = None

	# ---------------------------
	# Environment Setup
	# ---------------------------
	# Load variables (e.g. GITHUB_API_KEY) via python-dotenv before anything reads os.environ.
	load_dotenv()

	# Setup a persistent session for GitHub API requests
	session = requests.Session()
	session.headers.update({"Accept": "application/vnd.github.v3+json"})

	# Only attach credentials when a key is actually configured. Formatting a missing key
	# into the header would send the literal string "token None", which GitHub treats as a
	# bad credential instead of falling back to anonymous access.
	_github_api_key = os.getenv("GITHUB_API_KEY")
	if _github_api_key:
	    session.headers["Authorization"] = f"token {_github_api_key}"

	# ---------------------------
	# Langchain Groq Setup
	# ---------------------------
	# Chat model used by the tag-generation chain below. It is created once at import time
	# and reused for every query.
	# NOTE(review): credentials are not passed here, so they presumably come from the
	# environment populated by load_dotenv() — confirm which variable ChatGroq expects.
	# NOTE(review): max_tokens caps the whole completion, including the <think> reasoning the
	# system prompt asks for; a long reasoning trace could cut off the final tags — confirm.
	llm = ChatGroq(
	model="deepseek-r1-distill-llama-70b",
	temperature=0.3,
	max_tokens=512,
	max_retries=3,
	)
	# Two-message prompt: a fixed system instruction plus the user's text substituted into
	# the {query} placeholder. The system message asks the model to reason inside
	# <think> ... </think> tags and then emit colon-separated search tags, optionally ending
	# with a target-<language> tag; parse_search_tags() and valid_tags() rely on that format.
	# The prompt text is runtime data: do not reflow or edit it cosmetically.
	prompt = ChatPromptTemplate.from_messages([
	("system",
	"""You are a GitHub search optimization expert.

	Your job is to:
	1. Read a user's query about tools, research, or tasks.
	2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
	3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
	4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
	Use as many tags as necessary based on the query's complexity, but never more than five.
	5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
	If no specific language is mentioned, do not include any target tag.

	Output Format:
	tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]

	Rules:
	- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
	- Use terms commonly found in GitHub repo names, topics, or descriptions.
	- Avoid generic terms like "python", "ai", "tool", "project".
	- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
	- Prefer real tools, popular methods, or dataset names when mentioned.
	- If your output does not strictly match the required format, correct it after your internal reasoning.
	- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.

	Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
	"""),
	# The user's raw query is substituted here when the chain is invoked with {"query": ...}.
	("human", "{query}")
	])
	# Compose the prompt template and the chat model into one runnable with the pipe
	# operator: chain.invoke({"query": ...}) fills the prompt and returns the model's reply.
	# (The stray backslash before the pipe was a markdown escape and is not valid Python.)
	chain = prompt | llm

	def parse_search_tags(response) -> str:
	    """
	    Removes the internal chain-of-thought and returns only the final search tags.

	    Everything up to and including the last ``</think>`` marker is discarded. Keying on the
	    closing marker alone also covers replies where the opening ``<think>`` tag is missing or
	    where several reasoning blocks appear before the answer.

	    Args:
	        response: Model output; it is converted with ``str()`` before parsing.

	    Returns:
	        The text after the last ``</think>`` with surrounding whitespace stripped, or the
	        whole stripped text when no closing marker is present (e.g. no reasoning was emitted
	        or the reasoning was cut off before it finished).
	    """
	    response_str = str(response)
	    # rpartition yields an empty separator when the marker is absent.
	    _, closing_tag, answer = response_str.rpartition("</think>")
	    if closing_tag:
	        return answer.strip()
	    return response_str.strip()

	def valid_tags(tags: str) -> bool:
	    """
	    Validates that the output is one to six colon-separated tokens composed of lowercase
	    letters, numbers, and hyphens.

	    Args:
	        tags: Candidate tag string, e.g. ``"image-augmentation:albumentations"``.

	    Returns:
	        True when the entire string matches the format, False otherwise.
	    """
	    pattern = r'[a-z0-9-]+(?::[a-z0-9-]+){0,5}'
	    # fullmatch must consume the whole string; match() with a trailing "$" would still
	    # accept an input that ends in a newline.
	    return re.fullmatch(pattern, tags) is not None

	def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
	    """
	    Asks the LLM chain for search tags, retrying with a format reminder when needed.

	    Args:
	        query: The user's natural-language request.
	        max_iterations: Maximum number of chain invocations.

	    Returns:
	        The first tag string that passes valid_tags(); otherwise the last (possibly
	        invalid) output, or an empty string if no iteration ran.
	    """
	    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
	    format_reminder = "\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
	    current_prompt = query
	    candidate = ""
	    attempt = 0
	    while attempt < max_iterations:
	        attempt += 1
	        print(f"\n🔄 Iteration {attempt}")
	        reply = chain.invoke({"query": current_prompt})
	        candidate = parse_search_tags(reply.content.strip())
	        print(f"Output Tags: {candidate}")
	        if valid_tags(candidate):
	            print("✅ Valid tags format detected.")
	            return candidate
	        # Invalid format: retry from the original query plus an explicit reminder.
	        print("⚠️ Invalid tags format. Requesting refinement...")
	        current_prompt = f"{query}{format_reminder}"
	    print("Final output (may be invalid):", candidate)
	    return candidate

	# ---------------------------
	# GitHub API Helper Functions
	# ---------------------------
	def fetch_readme_content(repo_full_name):
	    """
	    Downloads and decodes the README of a GitHub repository (best effort).

	    Args:
	        repo_full_name: Repository identifier interpolated into the API URL, e.g. the
	            ``full_name`` field of a search result.

	    Returns:
	        The README decoded as UTF-8 (undecodable bytes are replaced), or an empty string when
	        the request fails or times out, the status is not 200, or the payload cannot be
	        parsed or decoded.
	    """
	    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
	    try:
	        # Without a timeout one stalled connection would hang the whole ranking run.
	        response = session.get(readme_url, timeout=10)
	    except requests.RequestException:
	        return ""
	    if response.status_code != 200:
	        return ""
	    try:
	        encoded = response.json().get('content') or ""
	        return base64.b64decode(encoded).decode('utf-8', errors='replace')
	    except (ValueError, TypeError, AttributeError):
	        # ValueError covers invalid JSON and invalid base64; the other two cover payloads
	        # that are not the expected JSON object with a string "content" field.
	        return ""

	def fetch_github_repositories(query, max_results=10):
	    """
	    Searches GitHub repositories using the provided query and retrieves key information.

	    Args:
	        query: Search string sent as the ``q`` parameter (may include qualifiers).
	        max_results: Value sent as ``per_page``.

	    Returns:
	        A list of dicts with keys ``title``, ``link`` and ``combined_text`` (description
	        followed by README). An empty list is returned, after printing an error, when the
	        request fails, times out, or answers with a non-200 status.
	    """
	    url = "https://api.github.com/search/repositories"
	    params = {
	        "q": query,
	        "per_page": max_results,
	    }
	    try:
	        # Bound the wait so a stalled connection cannot block the caller indefinitely.
	        response = session.get(url, params=params, timeout=10)
	    except requests.RequestException as exc:
	        print(f"Error contacting GitHub: {exc}")
	        return []
	    if response.status_code != 200:
	        # Error bodies are not guaranteed to be JSON objects; fall back to the raw text.
	        try:
	            message = response.json().get('message')
	        except (ValueError, AttributeError):
	            message = response.text
	        print(f"Error {response.status_code}: {message}")
	        return []
	    try:
	        items = response.json().get('items', [])
	    except (ValueError, AttributeError):
	        print("Error: GitHub returned an unreadable search response.")
	        return []
	    repo_list = []
	    for repo in items:
	        description = repo.get('description') or ""
	        readme_content = fetch_readme_content(repo.get('full_name'))
	        # Combine description and README for a richer document context.
	        combined_text = (description + "\n" + readme_content).strip()
	        repo_list.append({
	            "title": repo.get('name', 'No title available'),
	            "link": repo.get('html_url'),
	            "combined_text": combined_text,
	        })
	    return repo_list

	# ---------------------------
	# Initialize SentenceTransformer Model
	# ---------------------------
	# Created once at import time; run_repository_ranking() uses this shared instance to
	# embed both the repository documents and the user query.
	model = SentenceTransformer('all-MiniLM-L6-v2')

	def robust_min_max_norm(scores):
	    """
	    Performs min-max normalization while avoiding division by zero.

	    Args:
	        scores: Array-like of numeric scores (a NumPy array or a plain sequence).

	    Returns:
	        A NumPy array of the same shape scaled to [0, 1]. When all scores are (nearly)
	        equal, every entry is 1. An empty input yields an empty array instead of raising.
	    """
	    scores = np.asarray(scores)
	    if scores.size == 0:
	        # min()/max() raise on zero-size arrays; there is nothing to normalize.
	        return scores.copy()
	    min_val = scores.min()
	    spread = scores.max() - min_val
	    if spread < 1e-10:
	        return np.ones_like(scores)
	    return (scores - min_val) / spread

	# ---------------------------
	# Main Function: Repository Ranking with Hybrid Retrieval
	# ---------------------------
	def run_repository_ranking(query: str) -> str:
	    """
	    Converts the user query into search tags, runs multiple GitHub queries (individual and
	    combined), deduplicates results, and applies hybrid dense (FAISS) and sparse (BM25)
	    ranking.

	    Args:
	        query: The user's natural-language request.

	    Returns:
	        A formatted, ranked listing of repositories, or the message
	        "No repositories found for your query." when nothing could be retrieved.
	    """
	    # Step 1: Generate search tags from the query.
	    search_tags = iterative_convert_to_search_tags(query)
	    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

	    # Step 2: Handle target language extraction. Only the leading "target-" prefix is
	    # stripped, and an empty language falls back to the Python default.
	    prefix = "target-"
	    target_lang = next((tag[len(prefix):] for tag in tag_list if tag.startswith(prefix)), None)
	    tag_list = [tag for tag in tag_list if not tag.startswith(prefix)]
	    lang_query = f"language:{target_lang}" if target_lang else "language:python"

	    if not tag_list:
	        # Without tags the combined query would degenerate to "() ...".
	        return "No repositories found for your query."

	    # Step 3: Build advanced search qualifiers.
	    advanced_qualifier = "in:name,description,readme"
	    all_repositories = []

	    # Loop over individual tags.
	    for tag in tag_list:
	        github_query = f"{tag} {advanced_qualifier} {lang_query}"
	        print("GitHub Query:", github_query)
	        all_repositories.extend(fetch_github_repositories(github_query, max_results=15))

	    # Also perform a combined query using OR logic for higher recall. With a single tag
	    # this would only repeat the query above, so it is skipped.
	    if len(tag_list) > 1:
	        combined_query = f"({' OR '.join(tag_list)}) {advanced_qualifier} {lang_query}"
	        print("Combined GitHub Query:", combined_query)
	        all_repositories.extend(fetch_github_repositories(combined_query, max_results=15))

	    # Deduplicate repositories using the repo link.
	    unique_repositories = {}
	    for repo in all_repositories:
	        known = unique_repositories.setdefault(repo["link"], repo)
	        if known is repo:
	            continue
	        # Merge only text that adds something: the same repository normally returns the
	        # same description and README, and appending it again would just duplicate it.
	        extra_text = repo["combined_text"]
	        if extra_text and extra_text not in known["combined_text"]:
	            known["combined_text"] = known["combined_text"] + "\n" + extra_text
	    repositories = list(unique_repositories.values())

	    if not repositories:
	        return "No repositories found for your query."

	    # Step 4: Prepare documents by using the combined text (description + README).
	    docs = [repo.get("combined_text", "") for repo in repositories]

	    # Step 5: Compute dense embeddings and build the FAISS index.
	    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
	    if doc_embeddings.ndim == 1:
	        doc_embeddings = doc_embeddings.reshape(1, -1)
	    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
	    # Unit-length vectors make the inner product equal to cosine similarity. FAISS
	    # expects contiguous float32 input.
	    norm_doc_embeddings = np.ascontiguousarray(doc_embeddings / (norms + 1e-10), dtype=np.float32)

	    query_embedding = model.encode(query, convert_to_numpy=True)
	    if query_embedding.ndim == 1:
	        query_embedding = query_embedding.reshape(1, -1)
	    norm_query_embedding = np.ascontiguousarray(
	        query_embedding / (np.linalg.norm(query_embedding) + 1e-10), dtype=np.float32
	    )

	    index = faiss.IndexFlatIP(norm_doc_embeddings.shape[1])
	    index.add(norm_doc_embeddings)
	    k = norm_doc_embeddings.shape[0]
	    distances, indices = index.search(norm_query_embedding, k)
	    # FAISS returns hits ordered by similarity, not by document position. Scatter the
	    # scores back through the returned indices so dense_scores[i] belongs to
	    # repositories[i] and lines up with the BM25 scores. This also keeps the array 1-D
	    # when there is only a single repository.
	    hit_ids = indices[0]
	    found = hit_ids >= 0
	    dense_scores = np.zeros(k, dtype=distances.dtype)
	    dense_scores[hit_ids[found]] = distances[0][found]
	    norm_dense_scores = robust_min_max_norm(dense_scores)

	    # Step 6: Compute BM25 scores with improved tokenization.
	    if BM25Okapi is not None:
	        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
	        bm25 = BM25Okapi(tokenized_docs)
	        query_tokens = re.findall(r'\w+', query.lower())
	        bm25_scores = np.array(bm25.get_scores(query_tokens))
	        norm_bm25_scores = robust_min_max_norm(bm25_scores)
	    else:
	        # rank_bm25 is optional; without it the ranking is dense-only.
	        norm_bm25_scores = np.zeros_like(norm_dense_scores)

	    # Step 7: Combine scores (with denser retrieval given higher weight).
	    alpha = 0.8  # Weight for dense retrieval
	    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores

	    for repo, score in zip(repositories, combined_scores):
	        repo["combined_score"] = float(score)

	    # Step 8: Rank repositories and format output.
	    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)

	    parts = ["\n=== Ranked Repositories ===\n"]
	    for rank, repo in enumerate(ranked_repositories, 1):
	        snippet = repo['combined_text'][:300].replace('\n', ' ')
	        parts.append(f"Final Rank: {rank}\n")
	        parts.append(f"Title: {repo['title']}\n")
	        parts.append(f"Link: {repo['link']}\n")
	        parts.append(f"Combined Score: {repo.get('combined_score', 0):.4f}\n")
	        parts.append(f"Snippet: {snippet}...\n")
	        parts.append('-' * 80 + "\n")
	    parts.append("\n=== End of Results ===")
	    return "".join(parts)

	# ---------------------------
	# Main Entry Point for Testing
	# ---------------------------
	if __name__ == "__main__":
	    # Manual smoke test: rank repositories for a sample query and print the report.
	    print(run_repository_ranking(
	        "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
	    ))