Spaces:
Running
on
Zero
Running
on
Zero
Delete src/deepgit_lite.py
Browse files- src/deepgit_lite.py +0 -327
src/deepgit_lite.py
DELETED
@@ -1,327 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import base64
|
3 |
-
import requests
|
4 |
-
import numpy as np
|
5 |
-
import datetime
|
6 |
-
from sentence_transformers import SentenceTransformer
|
7 |
-
import faiss
|
8 |
-
import math
|
9 |
-
import logging
|
10 |
-
from dotenv import load_dotenv
|
11 |
-
from pathlib import Path
|
12 |
-
from langchain_groq import ChatGroq
|
13 |
-
from langchain_core.prompts import ChatPromptTemplate
|
14 |
-
import re
|
15 |
-
import getpass
|
16 |
-
|
17 |
-
# ---------------------------
# Environment and .env Setup
# ---------------------------
# Look for a .env file in the parent of the directory that contains this module
# and load it when present; a missing file is not an error.
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

# Fail at import time: the GitHub helpers below read this key for every request.
if "GITHUB_API_KEY" not in os.environ:
    raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")

# Optionally, silence bitsandbytes warnings if desired.
os.environ["BITSANDBYTES_NOWARN"] = "1"
|
29 |
-
|
30 |
-
# ---------------------------
# Logging Setup
# ---------------------------
# Root logger at INFO with timestamped records; `logger` is the module-level
# logger used by the functions in this file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
|
35 |
-
|
36 |
-
# ---------------------------
# ChatGroq Integration Setup (for query conversion and final justification)
# ---------------------------
# Single shared client, used both by the tag-conversion chain and by
# justify_candidate().
llm_groq = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0.2,
    max_tokens=800,
    timeout=15,
    max_retries=2
)
|
46 |
-
|
47 |
-
# --- Query Conversion Functions ---
# System prompt that asks the model for colon-separated search tags, optionally
# followed by a trailing `target-<language>` tag; the user's query is supplied
# as the human message.
prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are a GitHub search optimization expert.

Your job is to:
1. Read a user's query about tools, research, or tasks.
2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
Use as many tags as necessary based on the query's complexity, but never more than five.
5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
If no specific language is mentioned, do not include any target tag.

Output Format:
tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]

Rules:
- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- Use terms commonly found in GitHub repo names, topics, or descriptions.
- Avoid generic terms like "python", "ai", "tool", "project".
- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- Prefer real tools, popular methods, or dataset names when mentioned.
- If your output does not strictly match the required format, correct it after your internal reasoning.
- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.

Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
"""),
    ("human", "{query}")
])
# Piping the prompt into the model yields a runnable invoked with {"query": ...}.
chain = prompt | llm_groq
|
78 |
-
|
79 |
-
def parse_search_tags(response: str) -> str:
    """
    Strip the model's <think> ... </think> commentary and return the remaining text.

    Every complete <think>...</think> span is removed (the match may span
    several lines). An opening <think> with no closing tag -- which can happen
    when the reply is cut off by the token limit -- is removed through the end
    of the text so partial reasoning is never mistaken for tags. The result is
    whitespace-stripped.
    """
    without_closed = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    # Anything after a dangling <think> is unfinished reasoning, not an answer.
    without_dangling = re.sub(r'<think>.*', '', without_closed, flags=re.DOTALL)
    return without_dangling.strip()
|
86 |
-
|
87 |
-
def valid_tags(tags: str) -> bool:
    """
    Return True when *tags* is one to six colon-separated tokens.

    Each token must be non-empty and consist only of lowercase letters,
    digits, and hyphens. The whole string has to match: `re.fullmatch` is used
    because `re.match` with a `$` anchor would also accept a trailing newline.
    """
    pattern = r'[a-z0-9-]+(?::[a-z0-9-]+){0,5}'
    return re.fullmatch(pattern, tags) is not None
|
93 |
-
|
94 |
-
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """
    Ask the LLM chain to turn *query* into colon-separated search tags.

    The chain is invoked up to *max_iterations* times. After each call the
    <think> commentary is stripped and the remainder is validated; the first
    valid result is returned immediately. After an invalid result the next
    attempt re-sends the original query with an explicit format reminder
    appended. If no attempt validates, a hard-coded fallback tag string is
    returned. Progress is reported with print().
    """
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    refined_query = query
    tags_output = ""
    for iteration in range(max_iterations):
        print(f"\n🔄 Iteration {iteration+1}")
        response = chain.invoke({"query": refined_query})
        full_output = response.content.strip()
        tags_output = parse_search_tags(full_output)
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        print("⚠️ Invalid tags format. Requesting refinement...")
        # Always rebuild from the original query so reminders do not pile up.
        refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    print("Final output (may be invalid):", tags_output)
    # Fallback default tags if output is still invalid
    fallback = "data-augmentation:llm-fine-tuning"
    print(f"Using fallback search tags: {fallback}")
    return fallback
|
115 |
-
|
116 |
-
# --- Justification Function ---
|
117 |
-
def justify_candidate(candidate, query):
    """
    Ask the LLM for a one-to-two line reason why *candidate* matches *query*.

    *candidate* is a dict that must provide 'title' and 'stars';
    'semantic_similarity' is optional and is shown as 0 when absent.

    Returns the `content` attribute of the model's reply when it has one,
    otherwise the reply converted with str().
    """
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    if hasattr(result, "content"):
        return result.content
    return str(result)
|
133 |
-
|
134 |
-
# ---------------------------
|
135 |
-
# GitHub API Helper Functions
|
136 |
-
# ---------------------------
|
137 |
-
def fetch_readme_content(repo_full_name, headers, timeout=15):
    """
    Return the decoded README text of a GitHub repository, or "" if unavailable.

    Args:
        repo_full_name: "owner/name" identifier inserted into the API URL.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The README text decoded from the base64 'content' field of the
        response. "" is returned when the request fails or the status is not
        200.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        response = requests.get(readme_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort like fetch_file_content: one unreachable README must not
        # abort the whole repository scan.
        logger.error(f"Error fetching README for {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    readme_data = response.json()
    # errors='replace': a README that is not valid UTF-8 degrades to
    # replacement characters instead of raising UnicodeDecodeError.
    return base64.b64decode(readme_data.get('content', '')).decode('utf-8', errors='replace')
|
144 |
-
|
145 |
-
def fetch_file_content(download_url, timeout=15):
    """
    Download *download_url* and return the response body as text.

    Args:
        download_url: URL to fetch.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        The response text on HTTP 200. "" is returned for any other status or
        when the request raises; the error is logged.
    """
    try:
        response = requests.get(download_url, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        # Deliberately broad: fetching documentation is best-effort.
        logger.error(f"Error fetching file: {e}")
    return ""
|
153 |
-
|
154 |
-
def fetch_directory_markdown(repo_full_name, path, headers, timeout=15):
    """
    Concatenate the Markdown files found directly inside a repository directory.

    Args:
        repo_full_name: "owner/name" identifier inserted into the API URL.
        path: Directory path inside the repository.
        headers: HTTP headers sent with the listing request.
        timeout: Seconds to wait for the listing request.

    Returns:
        One string in which every ".md" file (case-insensitive) is preceded by
        a "# <file name>" heading. "" is returned when the listing request
        fails, is not HTTP 200, or is not a JSON list. Sub-directories are not
        descended into.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error listing {path} in {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    items = response.json()
    # The loop below indexes each entry by key, so a non-list payload would crash it.
    if not isinstance(items, list):
        return ""
    sections = []
    for item in items:
        if item["type"] == "file" and item["name"].lower().endswith(".md"):
            content = fetch_file_content(item["download_url"])
            sections.append(f"\n\n# {item['name']}\n" + content)
    return "".join(sections)
|
165 |
-
|
166 |
-
def fetch_repo_documentation(repo_full_name, headers, timeout=15):
    """
    Collect a repository's README, other root-level Markdown files and docs folders.

    Args:
        repo_full_name: "owner/name" identifier inserted into the API URL.
        headers: HTTP headers sent with the GitHub API requests.
        timeout: Seconds to wait for the root listing request.

    Returns:
        A single string made of, in order: the README under a "# README"
        heading (when one was fetched), then for each entry of the root
        listing either a non-README ".md" file under a "# <name>" heading or
        the Markdown of a "docs"/"documentation" directory under a
        "# <name> folder" heading. When nothing but whitespace was gathered,
        "No documentation available." is returned.
    """
    parts = []
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        parts.append("# README\n" + readme)

    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    items = []
    try:
        response = requests.get(root_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Keep whatever was gathered so far; the listing is best-effort.
        logger.error(f"Error listing root of {repo_full_name}: {e}")
    else:
        if response.status_code == 200:
            payload = response.json()
            # Only a JSON list can be iterated entry by entry below.
            if isinstance(payload, list):
                items = payload

    for item in items:
        name = item["name"]
        lowered = name.lower()
        if item["type"] == "file" and lowered.endswith(".md"):
            # The README was already added above via the dedicated endpoint.
            if lowered != "readme.md":
                content = fetch_file_content(item["download_url"])
                parts.append(f"\n\n# {name}\n" + content)
        elif item["type"] == "dir" and lowered in ("docs", "documentation"):
            parts.append(f"\n\n# {name} folder\n" + fetch_directory_markdown(repo_full_name, name, headers))

    doc_text = "".join(parts)
    return doc_text if doc_text.strip() else "No documentation available."
|
183 |
-
|
184 |
-
def fetch_github_repositories(query, max_results=1000, per_page=100, timeout=15):
    """
    Search GitHub repositories for *query* and attach documentation to each hit.

    Results are requested sorted by stars, descending, one page at a time.
    Paging stops at the first failed request, non-200 response, or empty page,
    or once *max_results* repositories have been collected.

    Args:
        query: Search string passed as the `q` parameter.
        max_results: Upper bound on the number of repositories returned.
        per_page: Page size passed to the API.
        timeout: Seconds to wait for each search request.

    Returns:
        A list of dicts with the keys 'title', 'link', 'combined_doc', 'stars',
        'full_name' and 'open_issues_count'.
    """
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    if max_results <= 0 or per_page <= 0:
        return repositories

    # Ceiling division: floor division requested zero pages whenever
    # max_results was smaller than per_page.
    num_pages = (max_results + per_page - 1) // per_page
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            logger.error(f"Error querying GitHub search: {e}")
            break
        if response.status_code != 200:
            # The error body is not guaranteed to be JSON.
            try:
                message = response.json().get('message')
            except ValueError:
                message = response.text
            logger.error(f"Error {response.status_code}: {message}")
            break
        items = response.json().get('items', [])
        if not items:
            break
        # Take only as many hits as are still needed to honour max_results.
        remaining = max_results - len(repositories)
        for repo in items[:remaining]:
            full_name = repo.get('full_name', '')
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo['html_url'],
                "combined_doc": fetch_repo_documentation(full_name, headers),
                "stars": repo.get('stargazers_count', 0),
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
        if len(repositories) >= max_results:
            break
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
|
222 |
-
|
223 |
-
# ---------------------------
|
224 |
-
# Main Lite Workflow Function
|
225 |
-
# ---------------------------
|
226 |
-
def run_deepgit_lite(user_query):
    """
    Run the lite pipeline for *user_query* and return a plain-text report.

    Stages:
        0. Convert the query to search tags and build a GitHub search string.
        1. Fetch repositories, embed their documentation and score each one
           against the query by inner product of unit-normalised embeddings.
        2. Keep repositories with at least 50 stars (all of them if none qualify).
        3. Combine min-max normalised similarity (weight 0.6) and
           log(stars + 1) (weight 0.4) into 'final_score'.
        4. Ask the LLM to justify each of the top 10 results.

    Returns:
        The formatted report for the top 10 repositories, or a short message
        when no repositories or no embeddings could be obtained.
    """
    # Stage 0: Query Conversion using iterative_convert_to_search_tags
    logger.info("Converting query to searchable tags...")
    original_query = user_query.strip()
    search_tags = iterative_convert_to_search_tags(original_query)
    logger.info(f"Search Tags: {search_tags}")
    # Convert colon-separated tags into a space-separated query string.
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
    # The conversion prompt asks for a trailing `target-<language>` tag when the
    # query names a non-Python language. Use it as the language qualifier rather
    # than searching for it as a keyword. Only the last tag is inspected, and
    # only when other tags exist.
    target_prefix = "target-"
    target_language = "python"
    if len(tag_list) > 1 and tag_list[-1].startswith(target_prefix) and len(tag_list[-1]) > len(target_prefix):
        target_language = tag_list.pop()[len(target_prefix):]
    github_query = " ".join(tag_list) + f" language:{target_language}"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.warning("No repositories found with converted query. Falling back to default query.")
        fallback_query = "data augmentation language:python"
        logger.info(f"Using fallback GitHub query: {fallback_query}")
        repos = fetch_github_repositories(fallback_query)
        if not repos:
            logger.error("No repositories found with fallback query either.")
            return "\nNo repositories found for your query. Please try a different query."

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    def normalize_embeddings(embeddings):
        # Row-wise L2 normalisation; the epsilon avoids division by zero.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        # FAISS pads missing neighbours with -1, which would otherwise index
        # the last repository.
        if idx < 0:
            continue
        repos[idx]["semantic_similarity"] = float(score)
    # Repositories outside the top-k keep no similarity and sort as 0.
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]

    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        # Min-max scaling; a degenerate range maps every value to 0.5.
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo, sem_score, star_score in zip(filtered_candidates, semantic_scores, star_scores):
        norm_sem = normalize(sem_score, min_sem, max_sem)
        norm_star = normalize(star_score, min_star, max_star)
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    top_candidates = final_ranked[:10]
    # Keyed by link, not title: two repositories may share the same name.
    justifications = {}
    for repo in top_candidates:
        justification = justify_candidate(repo, user_query)
        justifications[repo['link']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table.
    lines = ["\n=== Final Ranked Repositories ===\n"]
    for rank, repo in enumerate(top_candidates, 1):
        lines.append(f"Final Rank: {rank}\n")
        lines.append(f"Title: {repo['title']}\n")
        lines.append(f"Link: {repo['link']}\n")
        lines.append(f"Stars: {repo['stars']}\n")
        lines.append(f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n")
        lines.append(f"Final Score: {repo.get('final_score', 0):.4f}\n")
        lines.append(f"Justification: {justifications.get(repo['link'], 'No justification available')}\n")
        lines.append(f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n")
        lines.append('-' * 80 + "\n")
    lines.append("\n=== End of Results ===")

    return "".join(lines)
|
323 |
-
|
324 |
-
# For debugging: if run directly, execute with an example query.
if __name__ == "__main__":
    test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    print(run_deepgit_lite(test_query))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|