# Spaces: Running on Zero
# Author: zamalali
# Commit: Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback
# Revision: 777083e
import os | |
import base64 | |
import requests | |
import numpy as np | |
import faiss | |
import re | |
from sentence_transformers import SentenceTransformer | |
from dotenv import load_dotenv | |
from pathlib import Path | |
from langchain_groq import ChatGroq | |
from langchain_core.prompts import ChatPromptTemplate | |
# Optionally import BM25 for sparse retrieval. | |
try: | |
from rank_bm25 import BM25Okapi | |
except ImportError: | |
BM25Okapi = None | |
# --------------------------- | |
# Environment Setup | |
# --------------------------- | |
load_dotenv()

# Setup a persistent session for GitHub API requests so that the default
# headers are shared by every call made through it.
session = requests.Session()
session.headers.update({"Accept": "application/vnd.github.v3+json"})

# Attach credentials only when a token is actually configured. Formatting a
# missing variable straight into the header would send the literal value
# "token None" with every request instead of leaving the request anonymous.
_github_api_key = os.getenv("GITHUB_API_KEY")
if _github_api_key:
    session.headers["Authorization"] = f"token {_github_api_key}"
else:
    print("Warning: GITHUB_API_KEY is not set; GitHub API requests will be sent unauthenticated.")
# --------------------------- | |
# Langchain Groq Setup | |
# --------------------------- | |
# Settings for the Groq-hosted chat model that converts free-form queries
# into GitHub search tags.
_LLM_SETTINGS = {
    "model": "deepseek-r1-distill-llama-70b",
    "temperature": 0.3,
    "max_tokens": 512,
    "max_retries": 3,
}
llm = ChatGroq(**_LLM_SETTINGS)
# System instructions sent to the model. The text is part of the program's
# runtime behavior and is kept verbatim.
_SYSTEM_INSTRUCTIONS = """You are a GitHub search optimization expert.
Your job is to:
1. Read a user's query about tools, research, or tasks.
2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
Use as many tags as necessary based on the query's complexity, but never more than five.
5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
If no specific language is mentioned, do not include any target tag.
Output Format:
tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
Rules:
- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- Use terms commonly found in GitHub repo names, topics, or descriptions.
- Avoid generic terms like "python", "ai", "tool", "project".
- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- Prefer real tools, popular methods, or dataset names when mentioned.
- If your output does not strictly match the required format, correct it after your internal reasoning.
- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
"""

# The human turn carries the user's query through the "query" template variable.
_HUMAN_TEMPLATE = "{query}"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_INSTRUCTIONS),
        ("human", _HUMAN_TEMPLATE),
    ]
)

# Pipe the rendered prompt into the chat model; invoked with {"query": ...}.
chain = prompt | llm
def parse_search_tags(response) -> str:
    """Strip the model's chain-of-thought and return only the final search tags.

    Args:
        response: The model output. It is converted with ``str()`` before
            parsing, so any object with a meaningful string form is accepted.

    Returns:
        The text that follows the last ``</think>`` marker, with surrounding
        whitespace removed. When no ``</think>`` marker is present the whole
        (stripped) response is returned.
    """
    response_str = str(response)
    # Keying on the closing marker alone also covers replies where the opening
    # <think> tag is missing, and taking the *last* marker covers replies with
    # several reasoning blocks. rpartition() leaves the full string in the
    # tail slot when the marker is absent, so no separate branch is needed.
    _, _, tail = response_str.rpartition("</think>")
    return tail.strip()
def valid_tags(tags: str) -> bool:
    """Check that ``tags`` is one to six colon-separated search tokens.

    Each token may contain only lowercase letters, digits, and hyphens, and
    must not be empty.

    Args:
        tags: Candidate tag string, e.g. ``"llm:data-augmentation"``.

    Returns:
        ``True`` when the whole string matches the format, ``False`` otherwise.
    """
    pattern = r'[a-z0-9-]+(?::[a-z0-9-]+){0,5}'
    # fullmatch() is used instead of match() with "^...$": "$" also matches
    # just before a trailing newline, which would let "a:b\n" through.
    return re.fullmatch(pattern, tags) is not None
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """Ask the model for search tags, retrying while the format is invalid.

    Each attempt invokes ``chain`` and cleans the reply with
    ``parse_search_tags``. The first reply accepted by ``valid_tags`` is
    returned immediately; otherwise the query is re-sent with an explicit
    formatting reminder appended.

    Args:
        query: The user's natural-language request.
        max_iterations: Maximum number of attempts.

    Returns:
        The first valid tag string, or the last (possibly invalid) output
        when every attempt fails. An empty string if no attempt is made.
    """
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")

    format_reminder = (
        "\nPlease refine your answer so that the output strictly matches the format: "
        "tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    )
    current_prompt = query
    latest_tags = ""

    for attempt in range(1, max_iterations + 1):
        print(f"\n🔄 Iteration {attempt}")
        reply = chain.invoke({"query": current_prompt})
        latest_tags = parse_search_tags(reply.content.strip())
        print(f"Output Tags: {latest_tags}")

        if not valid_tags(latest_tags):
            print("⚠️ Invalid tags format. Requesting refinement...")
            # The reminder is always appended to the original query, never to
            # an earlier refined prompt.
            current_prompt = f"{query}{format_reminder}"
            continue

        print("✅ Valid tags format detected.")
        return latest_tags

    print("Final output (may be invalid):", latest_tags)
    return latest_tags
# --------------------------- | |
# GitHub API Helper Functions | |
# --------------------------- | |
def fetch_readme_content(repo_full_name, timeout=10):
    """Download and decode a repository's README through the GitHub API.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The README decoded as UTF-8 text (undecodable bytes are replaced), or
        an empty string when the request fails, the status is not 200, or the
        payload cannot be decoded.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = session.get(readme_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch README for {repo_full_name}: {exc}")
        return ""

    if response.status_code != 200:
        return ""

    try:
        # "content" holds the base64-encoded file body; `or ""` also covers a
        # JSON null value.
        encoded = response.json().get('content') or ''
        return base64.b64decode(encoded).decode('utf-8', errors='replace')
    except (ValueError, TypeError, AttributeError):
        # ValueError covers malformed JSON and invalid base64 (binascii.Error
        # is a ValueError subclass); the other two cover unexpected payload shapes.
        return ""
def fetch_github_repositories(query, max_results=10, timeout=10):
    """Search GitHub repositories and collect the text used for ranking.

    Args:
        query: GitHub search expression, passed as the ``q`` parameter.
        max_results: Value sent as ``per_page`` (results requested).
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``link`` and
        ``combined_text`` (description plus README). The list is empty when
        the request fails or the API answers with a non-200 status.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "per_page": max_results,
    }
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = session.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"GitHub search request failed: {exc}")
        return []

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON objects; fall back to the
        # raw text so reporting the error cannot itself raise.
        try:
            message = response.json().get('message')
        except (ValueError, AttributeError):
            message = response.text
        print(f"Error {response.status_code}: {message}")
        return []

    repo_list = []
    for repo in response.json().get('items') or []:
        description = repo.get('description') or ""
        readme_content = fetch_readme_content(repo.get('full_name'))
        # Combine description and README for a richer document context.
        combined_text = (description + "\n" + readme_content).strip()
        repo_list.append({
            "title": repo.get('name', 'No title available'),
            "link": repo.get('html_url'),
            "combined_text": combined_text,
        })
    return repo_list
# --------------------------- | |
# Initialize SentenceTransformer Model | |
# --------------------------- | |
# Name of the sentence-embedding checkpoint used for dense retrieval.
_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
def robust_min_max_norm(scores):
    """Min-max normalize ``scores`` to the range [0, 1].

    Args:
        scores: Array-like of numeric scores (a NumPy array or a plain
            sequence).

    Returns:
        A float NumPy array of the same shape. An empty input yields an empty
        array. When all values are (nearly) identical every entry is 1, which
        avoids dividing by zero.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        # min()/max() raise on zero-size arrays; there is nothing to scale.
        return scores
    min_val = scores.min()
    max_val = scores.max()
    spread = max_val - min_val
    if spread < 1e-10:
        return np.ones_like(scores)
    return (scores - min_val) / spread
# --------------------------- | |
# Main Function: Repository Ranking with Hybrid Retrieval | |
# --------------------------- | |
def run_repository_ranking(query: str) -> str:
    """Find GitHub repositories for ``query`` and rank them by relevance.

    The query is converted into search tags, one GitHub search is run per tag
    plus a combined OR search, results are deduplicated by link, and each
    repository is scored with a weighted mix of dense similarity (sentence
    embeddings searched through FAISS) and sparse BM25 similarity when
    ``rank_bm25`` is available.

    Args:
        query: The user's natural-language request.

    Returns:
        A formatted, human-readable ranking report, or a short message when
        no repositories were found.
    """
    # Step 1: Generate search tags from the query.
    search_tags = iterative_convert_to_search_tags(query)
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

    # Step 2: Handle target language extraction. The first "target-<lang>"
    # tag selects the language; all such tags are removed from the keywords.
    target_prefix = "target-"
    target_lang = next(
        (tag[len(target_prefix):] for tag in tag_list if tag.startswith(target_prefix)),
        None,
    )
    tag_list = [tag for tag in tag_list if not tag.startswith(target_prefix)]
    # A missing or empty target falls back to Python.
    lang_query = f"language:{target_lang}" if target_lang else "language:python"

    # Step 3: Build advanced search qualifiers and run the searches.
    advanced_qualifier = "in:name,description,readme"
    all_repositories = []
    # Loop over individual tags.
    for tag in tag_list:
        github_query = f"{tag} {advanced_qualifier} {lang_query}"
        print("GitHub Query:", github_query)
        all_repositories.extend(fetch_github_repositories(github_query, max_results=15))
    # Also perform a combined query using OR logic for higher recall. Skipped
    # when there are no tags, which would otherwise send an empty "()" group.
    if tag_list:
        combined_query = f"({' OR '.join(tag_list)}) {advanced_qualifier} {lang_query}"
        print("Combined GitHub Query:", combined_query)
        all_repositories.extend(fetch_github_repositories(combined_query, max_results=15))

    # Deduplicate repositories using the repo link.
    unique_repositories = {}
    for repo in all_repositories:
        existing = unique_repositories.setdefault(repo["link"], repo)
        # Merge content only when a repeated hit brings text that is not
        # already present; appending identical text would just duplicate the
        # document.
        if existing is not repo and repo["combined_text"] not in existing["combined_text"]:
            existing["combined_text"] = existing["combined_text"] + "\n" + repo["combined_text"]
    repositories = list(unique_repositories.values())
    if not repositories:
        return "No repositories found for your query."

    # Step 4: Prepare documents by using the combined text (description + README).
    docs = [repo.get("combined_text", "") for repo in repositories]

    # Step 5: Compute dense embeddings and build the FAISS index.
    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    doc_embeddings = np.atleast_2d(doc_embeddings)
    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    # Unit-length vectors make the inner product equal to cosine similarity.
    norm_doc_embeddings = np.ascontiguousarray(doc_embeddings / (norms + 1e-10), dtype=np.float32)

    query_embedding = np.atleast_2d(model.encode(query, convert_to_numpy=True))
    norm_query_embedding = np.ascontiguousarray(
        query_embedding / (np.linalg.norm(query_embedding) + 1e-10), dtype=np.float32
    )

    dim = norm_doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(norm_doc_embeddings)
    k = norm_doc_embeddings.shape[0]
    distances, indices = index.search(norm_query_embedding, k)
    # The search returns hits ordered by score, not by document position.
    # Scatter each score back to its document's slot so that dense_scores[i]
    # belongs to repositories[i]. Indexing row 0 (instead of squeezing) also
    # keeps the array 1-D when there is only one repository.
    hit_ids = indices[0]
    found = hit_ids >= 0  # ignore any slot the index could not fill
    dense_scores = np.zeros(k, dtype=np.float32)
    dense_scores[hit_ids[found]] = distances[0][found]
    norm_dense_scores = robust_min_max_norm(dense_scores)

    # Step 6: Compute BM25 scores with improved tokenization.
    if BM25Okapi is not None:
        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        query_tokens = re.findall(r'\w+', query.lower())
        bm25_scores = np.array(bm25.get_scores(query_tokens))
        norm_bm25_scores = robust_min_max_norm(bm25_scores)
    else:
        norm_bm25_scores = np.zeros_like(norm_dense_scores)

    # Step 7: Combine scores (with dense retrieval given higher weight).
    alpha = 0.8  # Weight for dense retrieval
    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores
    for repo, score in zip(repositories, combined_scores):
        repo["combined_score"] = float(score)

    # Step 8: Rank repositories and format output.
    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)
    parts = ["\n=== Ranked Repositories ===\n"]
    for rank, repo in enumerate(ranked_repositories, 1):
        snippet = repo['combined_text'][:300].replace('\n', ' ')
        parts.append(f"Final Rank: {rank}\n")
        parts.append(f"Title: {repo['title']}\n")
        parts.append(f"Link: {repo['link']}\n")
        parts.append(f"Combined Score: {repo.get('combined_score', 0):.4f}\n")
        parts.append(f"Snippet: {snippet}...\n")
        parts.append('-' * 80 + "\n")
    parts.append("\n=== End of Results ===")
    return "".join(parts)
# --------------------------- | |
# Main Entry Point for Testing | |
# --------------------------- | |
if __name__ == "__main__":
    # Manual smoke test: rank repositories for a sample query and print the report.
    sample_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    print(run_repository_ranking(sample_query))