zamal committed on
Commit 0f63890 · verified · 1 Parent(s): f8f258d

Delete src/deepgit_lite.py

Files changed (1)
  1. src/deepgit_lite.py +0 -327
src/deepgit_lite.py DELETED
@@ -1,327 +0,0 @@
- import os
- import base64
- import requests
- import numpy as np
- import datetime
- from sentence_transformers import SentenceTransformer
- import faiss
- import math
- import logging
- from dotenv import load_dotenv
- from pathlib import Path
- from langchain_groq import ChatGroq
- from langchain_core.prompts import ChatPromptTemplate
- import re
- import getpass
-
- # ---------------------------
- # Environment and .env Setup
- # ---------------------------
- dotenv_path = Path(__file__).resolve().parents[1] / ".env"
- if dotenv_path.exists():
-     load_dotenv(dotenv_path=dotenv_path)
-
- if "GITHUB_API_KEY" not in os.environ:
-     raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
-
- # Optionally, silence bitsandbytes warnings if desired.
- os.environ["BITSANDBYTES_NOWARN"] = "1"
-
- # ---------------------------
- # Logging Setup
- # ---------------------------
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)
-
- # ---------------------------
- # ChatGroq Integration Setup (for query conversion and final justification)
- # ---------------------------
- llm_groq = ChatGroq(
-     model="deepseek-r1-distill-llama-70b",
-     temperature=0.2,
-     max_tokens=800,
-     timeout=15,
-     max_retries=2
- )
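- # A low temperature (0.2) keeps tag generation and justifications close to
- # deterministic; max_retries guards against transient Groq API failures.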
-
- # --- Query Conversion Functions ---
- prompt = ChatPromptTemplate.from_messages([
-     ("system",
-      """You are a GitHub search optimization expert.
-
- Your job is to:
- 1. Read a user's query about tools, research, or tasks.
- 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
- 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
- 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
-    Use as many tags as necessary based on the query's complexity, but never more than five.
- 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
-    If no specific language is mentioned, do not include any target tag.
-
- Output Format:
- tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
-
- Rules:
- - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- - Use terms commonly found in GitHub repo names, topics, or descriptions.
- - Avoid generic terms like "python", "ai", "tool", "project".
- - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- - Prefer real tools, popular methods, or dataset names when mentioned.
- - If your output does not strictly match the required format, correct it after your internal reasoning.
- - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
-
- Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
- """),
-     ("human", "{query}")
- ])
- chain = prompt | llm_groq
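- # LangChain's pipe operator composes the template and model into one runnable:
- # chain.invoke({"query": ...}) formats the messages and calls the LLM.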
-
- def parse_search_tags(response: str) -> str:
-     """
-     Removes any internal commentary enclosed in <think> ... </think> tags using regex,
-     and returns only the final searchable tags.
-     """
-     cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
-     return cleaned
-
- def valid_tags(tags: str) -> bool:
-     """
-     Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
-     """
-     pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
-     return re.match(pattern, tags) is not None
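- # Example: "image-augmentation:albumentations:target-javascript" passes this
- # check; output containing spaces, uppercase letters, or more than six
- # colon-separated tokens is rejected.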
-
- def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
-     print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
-     refined_query = query
-     tags_output = ""
-     for iteration in range(max_iterations):
-         print(f"\n🔄 Iteration {iteration+1}")
-         response = chain.invoke({"query": refined_query})
-         full_output = response.content.strip()
-         tags_output = parse_search_tags(full_output)
-         print(f"Output Tags: {tags_output}")
-         if valid_tags(tags_output):
-             print("✅ Valid tags format detected.")
-             return tags_output
-         else:
-             print("⚠️ Invalid tags format. Requesting refinement...")
-             refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
-     print("Final output (may be invalid):", tags_output)
-     # Fallback default tags if output is still invalid
-     fallback = "data-augmentation:llm-fine-tuning"
-     print(f"Using fallback search tags: {fallback}")
-     return fallback
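- # Illustrative run (actual model output may differ): a query like
- # "chain of thought prompting for JS" could yield
- # "chain-of-thought:prompt-engineering:target-javascript".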
-
- # --- Justification Function ---
- def justify_candidate(candidate, query):
-     prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
-
- Repository Details:
- - Stars: {candidate['stars']}
- - Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
-
- Provide a concise justification:"""
-     messages = [
-         ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
-         ("human", prompt)
-     ]
-     result = llm_groq.invoke(messages)
-     if hasattr(result, "content"):
-         return result.content
-     return str(result)
-
- # ---------------------------
- # GitHub API Helper Functions
- # ---------------------------
- def fetch_readme_content(repo_full_name, headers):
-     readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
-     response = requests.get(readme_url, headers=headers)
-     if response.status_code == 200:
-         readme_data = response.json()
-         return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
-     return ""
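- # The GitHub README endpoint returns the file body base64-encoded, hence the
- # b64decode step above.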
-
- def fetch_file_content(download_url):
-     try:
-         response = requests.get(download_url)
-         if response.status_code == 200:
-             return response.text
-     except Exception as e:
-         logger.error(f"Error fetching file: {e}")
-     return ""
-
- def fetch_directory_markdown(repo_full_name, path, headers):
-     md_content = ""
-     url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
-     response = requests.get(url, headers=headers)
-     if response.status_code == 200:
-         items = response.json()
-         for item in items:
-             if item["type"] == "file" and item["name"].lower().endswith(".md"):
-                 content = fetch_file_content(item["download_url"])
-                 md_content += f"\n\n# {item['name']}\n" + content
-     return md_content
-
- def fetch_repo_documentation(repo_full_name, headers):
-     doc_text = ""
-     readme = fetch_readme_content(repo_full_name, headers)
-     if readme:
-         doc_text += "# README\n" + readme
-     root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
-     response = requests.get(root_url, headers=headers)
-     if response.status_code == 200:
-         items = response.json()
-         for item in items:
-             if item["type"] == "file" and item["name"].lower().endswith(".md"):
-                 if item["name"].lower() != "readme.md":
-                     content = fetch_file_content(item["download_url"])
-                     doc_text += f"\n\n# {item['name']}\n" + content
-             elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
-                 doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
-     return doc_text if doc_text.strip() else "No documentation available."
-
- def fetch_github_repositories(query, max_results=1000, per_page=100):
-     url = "https://api.github.com/search/repositories"
-     headers = {
-         "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
-         "Accept": "application/vnd.github.v3+json"
-     }
-     repositories = []
-     num_pages = max_results // per_page
-     for page in range(1, num_pages + 1):
-         params = {
-             "q": query,
-             "sort": "stars",
-             "order": "desc",
-             "per_page": per_page,
-             "page": page
-         }
-         response = requests.get(url, headers=headers, params=params)
-         if response.status_code != 200:
-             logger.error(f"Error {response.status_code}: {response.json().get('message')}")
-             break
-         items = response.json().get('items', [])
-         if not items:
-             break
-         for repo in items:
-             repo_link = repo['html_url']
-             full_name = repo.get('full_name', '')
-             doc_content = fetch_repo_documentation(full_name, headers)
-             star_count = repo.get('stargazers_count', 0)
-             repositories.append({
-                 "title": repo.get('name', 'No title available'),
-                 "link": repo_link,
-                 "combined_doc": doc_content,
-                 "stars": star_count,
-                 "full_name": full_name,
-                 "open_issues_count": repo.get('open_issues_count', 0)
-             })
-     logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
-     return repositories
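- # Note: the GitHub Search API serves at most 1,000 results per query, so the
- # default max_results=1000 with per_page=100 translates to at most 10 pages.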
-
- # ---------------------------
- # Main Lite Workflow Function
- # ---------------------------
- def run_deepgit_lite(user_query):
-     # Stage 0: Query Conversion using iterative_convert_to_search_tags
-     logger.info("Converting query to searchable tags...")
-     original_query = user_query.strip()
-     search_tags = iterative_convert_to_search_tags(original_query)
-     logger.info(f"Search Tags: {search_tags}")
-     # Convert colon-separated tags into a space-separated query string.
-     tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
-     github_query = " ".join(tag_list) + " language:python"
-     logger.info(f"Using GitHub query: {github_query}")
-
-     # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
-     logger.info("Fetching repositories from GitHub...")
-     repos = fetch_github_repositories(github_query)
-     if not repos:
-         logger.warning("No repositories found with converted query. Falling back to default query.")
-         fallback_query = "data augmentation language:python"
-         logger.info(f"Using fallback GitHub query: {fallback_query}")
-         repos = fetch_github_repositories(fallback_query)
-         if not repos:
-             logger.error("No repositories found with fallback query either.")
-             return "\nNo repositories found for your query. Please try a different query."
-
-     docs = [repo.get("combined_doc", "") for repo in repos]
-     logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
-     sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
-     doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
-
-     if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
-         logger.error("No document embeddings generated. Aborting dense retrieval.")
-         return "\nFailed to generate document embeddings. Please try again."
-
-     def normalize_embeddings(embeddings):
-         norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-         return embeddings / (norms + 1e-10)
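-     # With unit-norm embeddings, inner product equals cosine similarity, so
-     # the IndexFlatIP search below is effectively cosine-similarity retrieval.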
-
-     doc_embeddings = normalize_embeddings(doc_embeddings)
-     query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
-     query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
-     dim = doc_embeddings.shape[1]
-     index = faiss.IndexFlatIP(dim)
-     index.add(doc_embeddings)
-     k = min(100, doc_embeddings.shape[0])
-     D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
-     for idx, score in zip(I[0], D[0]):
-         repos[idx]["semantic_similarity"] = score
-     ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
-     logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")
-
-     # Stage 2: Filtering Low-Star Repositories
-     filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
-     if not filtered_candidates:
-         filtered_candidates = ranked_by_semantic
-     logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
-
-     # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
-     semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
-     star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
-
-     min_sem, max_sem = min(semantic_scores), max(semantic_scores)
-     min_star, max_star = min(star_scores), max(star_scores)
-
-     def normalize(val, min_val, max_val):
-         if max_val - min_val == 0:
-             return 0.5
-         return (val - min_val) / (max_val - min_val)
-
-     for repo in filtered_candidates:
-         norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
-         norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
-         repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
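-     # e.g., norm_sem = 0.9 and norm_star = 0.5 give a final_score of
-     # 0.6 * 0.9 + 0.4 * 0.5 = 0.74, weighting semantic fit over star count.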
-
-     final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
-     logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")
-
-     # Stage 4: Final Justification using ChatGroq
-     justifications = {}
-     for repo in final_ranked[:10]:
-         justification = justify_candidate(repo, user_query)
-         justifications[repo['title']] = justification
-         logger.info(f"Justification for {repo['title']}: {justification}")
-
-     # Format final results into a text table.
-     result_text = "\n=== Final Ranked Repositories ===\n"
-     for rank, repo in enumerate(final_ranked[:10], 1):
-         result_text += f"Final Rank: {rank}\n"
-         result_text += f"Title: {repo['title']}\n"
-         result_text += f"Link: {repo['link']}\n"
-         result_text += f"Stars: {repo['stars']}\n"
-         result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
-         result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
-         result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
-         result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
-         result_text += '-' * 80 + "\n"
-     result_text += "\n=== End of Results ==="
-
-     return result_text
-
- # For debugging: if run directly, execute with an example query.
- if __name__ == "__main__":
-     test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
-     print(run_deepgit_lite(test_query))