zamal committed on
Commit 0f63890 · verified · 1 Parent(s): f8f258d

Delete src/deepgit_lite.py

Files changed (1)
  1. src/deepgit_lite.py +0 -327
src/deepgit_lite.py DELETED
@@ -1,327 +0,0 @@
- import os
- import base64
- import requests
- import numpy as np
- import datetime
- from sentence_transformers import SentenceTransformer
- import faiss
- import math
- import logging
- from dotenv import load_dotenv
- from pathlib import Path
- from langchain_groq import ChatGroq
- from langchain_core.prompts import ChatPromptTemplate
- import re
- import getpass
-
- # ---------------------------
- # Environment and .env Setup
- # ---------------------------
- dotenv_path = Path(__file__).resolve().parents[1] / ".env"
- if dotenv_path.exists():
-     load_dotenv(dotenv_path=dotenv_path)
-
- if "GITHUB_API_KEY" not in os.environ:
-     raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
-
- # Optionally, silence bitsandbytes warnings if desired.
- os.environ["BITSANDBYTES_NOWARN"] = "1"
-
- # ---------------------------
- # Logging Setup
- # ---------------------------
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)
-
- # ---------------------------
- # ChatGroq Integration Setup (for query conversion and final justification)
- # ---------------------------
- llm_groq = ChatGroq(
-     model="deepseek-r1-distill-llama-70b",
-     temperature=0.2,
-     max_tokens=800,
-     timeout=15,
-     max_retries=2
- )
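- # A low temperature (0.2) keeps tag generation and justifications close to
- # deterministic; max_retries guards against transient Groq API failures.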
-
- # --- Query Conversion Functions ---
- prompt = ChatPromptTemplate.from_messages([
-     ("system",
-      """You are a GitHub search optimization expert.
-
- Your job is to:
- 1. Read a user's query about tools, research, or tasks.
- 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
- 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
- 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
-    Use as many tags as necessary based on the query's complexity, but never more than five.
- 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
-    If no specific language is mentioned, do not include any target tag.
-
- Output Format:
- tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
-
- Rules:
- - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- - Use terms commonly found in GitHub repo names, topics, or descriptions.
- - Avoid generic terms like "python", "ai", "tool", "project".
- - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- - Prefer real tools, popular methods, or dataset names when mentioned.
- - If your output does not strictly match the required format, correct it after your internal reasoning.
- - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
-
- Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
- """),
-     ("human", "{query}")
- ])
- chain = prompt | llm_groq
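- # LangChain's pipe operator composes the template and model into one runnable:
- # chain.invoke({"query": ...}) formats the messages and calls the LLM.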
-
- def parse_search_tags(response: str) -> str:
-     """
-     Removes any internal commentary enclosed in <think> ... </think> tags using regex,
-     and returns only the final searchable tags.
-     """
-     cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
-     return cleaned
-
- def valid_tags(tags: str) -> bool:
-     """
-     Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
-     """
-     pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
-     return re.match(pattern, tags) is not None
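- # Example: "image-augmentation:albumentations:target-javascript" passes this
- # check; output containing spaces, uppercase letters, or more than six
- # colon-separated tokens is rejected.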
-
- def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
-     print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
-     refined_query = query
-     tags_output = ""
-     for iteration in range(max_iterations):
-         print(f"\n🔄 Iteration {iteration+1}")
-         response = chain.invoke({"query": refined_query})
-         full_output = response.content.strip()
-         tags_output = parse_search_tags(full_output)
-         print(f"Output Tags: {tags_output}")
-         if valid_tags(tags_output):
-             print("✅ Valid tags format detected.")
-             return tags_output
-         else:
-             print("⚠️ Invalid tags format. Requesting refinement...")
-             refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
-     print("Final output (may be invalid):", tags_output)
-     # Fallback default tags if output is still invalid
-     fallback = "data-augmentation:llm-fine-tuning"
-     print(f"Using fallback search tags: {fallback}")
-     return fallback
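- # Illustrative run (actual model output may differ): a query like
- # "chain of thought prompting for JS" could yield
- # "chain-of-thought:prompt-engineering:target-javascript".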
-
- # --- Justification Function ---
- def justify_candidate(candidate, query):
-     prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
-
- Repository Details:
- - Stars: {candidate['stars']}
- - Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
-
- Provide a concise justification:"""
-     messages = [
-         ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
-         ("human", prompt)
-     ]
-     result = llm_groq.invoke(messages)
-     if hasattr(result, "content"):
-         return result.content
-     return str(result)
-
- # ---------------------------
- # GitHub API Helper Functions
- # ---------------------------
- def fetch_readme_content(repo_full_name, headers):
-     readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
-     response = requests.get(readme_url, headers=headers)
-     if response.status_code == 200:
-         readme_data = response.json()
-         return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
-     return ""
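- # The GitHub README endpoint returns the file body base64-encoded, hence the
- # b64decode step above.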
-
- def fetch_file_content(download_url):
-     try:
-         response = requests.get(download_url)
-         if response.status_code == 200:
-             return response.text
-     except Exception as e:
-         logger.error(f"Error fetching file: {e}")
-     return ""
-
- def fetch_directory_markdown(repo_full_name, path, headers):
-     md_content = ""
-     url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
-     response = requests.get(url, headers=headers)
-     if response.status_code == 200:
-         items = response.json()
-         for item in items:
-             if item["type"] == "file" and item["name"].lower().endswith(".md"):
-                 content = fetch_file_content(item["download_url"])
-                 md_content += f"\n\n# {item['name']}\n" + content
-     return md_content
-
- def fetch_repo_documentation(repo_full_name, headers):
-     doc_text = ""
-     readme = fetch_readme_content(repo_full_name, headers)
-     if readme:
-         doc_text += "# README\n" + readme
-     root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
-     response = requests.get(root_url, headers=headers)
-     if response.status_code == 200:
-         items = response.json()
-         for item in items:
-             if item["type"] == "file" and item["name"].lower().endswith(".md"):
-                 if item["name"].lower() != "readme.md":
-                     content = fetch_file_content(item["download_url"])
-                     doc_text += f"\n\n# {item['name']}\n" + content
-             elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
-                 doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
-     return doc_text if doc_text.strip() else "No documentation available."
-
- def fetch_github_repositories(query, max_results=1000, per_page=100):
-     url = "https://api.github.com/search/repositories"
-     headers = {
-         "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
-         "Accept": "application/vnd.github.v3+json"
-     }
-     repositories = []
-     num_pages = max_results // per_page
-     for page in range(1, num_pages + 1):
-         params = {
-             "q": query,
-             "sort": "stars",
-             "order": "desc",
-             "per_page": per_page,
-             "page": page
-         }
-         response = requests.get(url, headers=headers, params=params)
-         if response.status_code != 200:
-             logger.error(f"Error {response.status_code}: {response.json().get('message')}")
-             break
-         items = response.json().get('items', [])
-         if not items:
-             break
-         for repo in items:
-             repo_link = repo['html_url']
-             full_name = repo.get('full_name', '')
-             doc_content = fetch_repo_documentation(full_name, headers)
-             star_count = repo.get('stargazers_count', 0)
-             repositories.append({
-                 "title": repo.get('name', 'No title available'),
-                 "link": repo_link,
-                 "combined_doc": doc_content,
-                 "stars": star_count,
-                 "full_name": full_name,
-                 "open_issues_count": repo.get('open_issues_count', 0)
-             })
-     logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
-     return repositories
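- # Note: the GitHub Search API serves at most 1,000 results per query, so the
- # default max_results=1000 with per_page=100 translates to at most 10 pages.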
-
- # ---------------------------
- # Main Lite Workflow Function
- # ---------------------------
- def run_deepgit_lite(user_query):
-     # Stage 0: Query Conversion using iterative_convert_to_search_tags
-     logger.info("Converting query to searchable tags...")
-     original_query = user_query.strip()
-     search_tags = iterative_convert_to_search_tags(original_query)
-     logger.info(f"Search Tags: {search_tags}")
-     # Convert colon-separated tags into a space-separated query string.
-     tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
-     github_query = " ".join(tag_list) + " language:python"
-     logger.info(f"Using GitHub query: {github_query}")
-
-     # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
-     logger.info("Fetching repositories from GitHub...")
-     repos = fetch_github_repositories(github_query)
-     if not repos:
-         logger.warning("No repositories found with converted query. Falling back to default query.")
-         fallback_query = "data augmentation language:python"
-         logger.info(f"Using fallback GitHub query: {fallback_query}")
-         repos = fetch_github_repositories(fallback_query)
-         if not repos:
-             logger.error("No repositories found with fallback query either.")
-             return "\nNo repositories found for your query. Please try a different query."
-
-     docs = [repo.get("combined_doc", "") for repo in repos]
-     logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
-     sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
-     doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
-
-     if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
-         logger.error("No document embeddings generated. Aborting dense retrieval.")
-         return "\nFailed to generate document embeddings. Please try again."
-
-     def normalize_embeddings(embeddings):
-         norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-         return embeddings / (norms + 1e-10)
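-     # With unit-norm embeddings, inner product equals cosine similarity, so
-     # the IndexFlatIP search below is effectively cosine-similarity retrieval.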
-
-     doc_embeddings = normalize_embeddings(doc_embeddings)
-     query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
-     query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
-     dim = doc_embeddings.shape[1]
-     index = faiss.IndexFlatIP(dim)
-     index.add(doc_embeddings)
-     k = min(100, doc_embeddings.shape[0])
-     D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
-     for idx, score in zip(I[0], D[0]):
-         repos[idx]["semantic_similarity"] = score
-     ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
-     logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")
-
-     # Stage 2: Filtering Low-Star Repositories
-     filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
-     if not filtered_candidates:
-         filtered_candidates = ranked_by_semantic
-     logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
-
-     # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
-     semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
-     star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
-
-     min_sem, max_sem = min(semantic_scores), max(semantic_scores)
-     min_star, max_star = min(star_scores), max(star_scores)
-
-     def normalize(val, min_val, max_val):
-         if max_val - min_val == 0:
-             return 0.5
-         return (val - min_val) / (max_val - min_val)
-
-     for repo in filtered_candidates:
-         norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
-         norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
-         repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
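-     # e.g., norm_sem = 0.9 and norm_star = 0.5 give a final_score of
-     # 0.6 * 0.9 + 0.4 * 0.5 = 0.74, weighting semantic fit over star count.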
-
-     final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
-     logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")
-
-     # Stage 4: Final Justification using ChatGroq
-     justifications = {}
-     for repo in final_ranked[:10]:
-         justification = justify_candidate(repo, user_query)
-         justifications[repo['title']] = justification
-         logger.info(f"Justification for {repo['title']}: {justification}")
-
-     # Format final results into a text table.
-     result_text = "\n=== Final Ranked Repositories ===\n"
-     for rank, repo in enumerate(final_ranked[:10], 1):
-         result_text += f"Final Rank: {rank}\n"
-         result_text += f"Title: {repo['title']}\n"
-         result_text += f"Link: {repo['link']}\n"
-         result_text += f"Stars: {repo['stars']}\n"
-         result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
-         result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
-         result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
-         result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
-         result_text += '-' * 80 + "\n"
-     result_text += "\n=== End of Results ==="
-
-     return result_text
-
- # For debugging: if run directly, execute with an example query.
- if __name__ == "__main__":
-     test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
-     print(run_deepgit_lite(test_query))