acecalisto3 committed on
Commit
b1e9534
·
verified ·
1 Parent(s): 2e88bce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -34
app.py CHANGED
@@ -1,24 +1,183 @@
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
2
  import langchain
3
- import huggingface_hub
4
  import dotenv
5
  import yaml
6
- import os
7
  from typing import Optional, Union, List, Dict, Any, Tuple
8
  import subprocess
9
  from pathlib import Path
10
  import json
11
  import tempfile
12
- from datetime import datetime, timezone
13
  import re
14
- import requests
15
  import logging
16
  import shutil
17
 
18
- # Configure logging
 
 
19
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
  logger = logging.getLogger(__name__)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  class TerminalCommand:
23
  @staticmethod
24
  def execute(command: Union[str, List[str]], cwd: Optional[str] = None) -> Tuple[str, str, int]:
@@ -47,7 +206,7 @@ class GitUtilities:
47
  self.repo_path = Path(repo_path)
48
 
49
  def clone(self, url: str, branch: str = "main") -> bool:
50
- """Clone a repository"""
51
  stdout, stderr, code = TerminalCommand.execute(
52
  f"git clone -b {branch} {url} {self.repo_path}"
53
  )
@@ -56,9 +215,9 @@ class GitUtilities:
56
  return code == 0
57
 
58
  def commit(self, message: str) -> bool:
59
- """Create a commit with the given message"""
60
  stdout, stderr, code = TerminalCommand.execute(
61
- ["git", "commit", "-m", message],
62
  str(self.repo_path)
63
  )
64
  if code != 0:
@@ -66,7 +225,7 @@ class GitUtilities:
66
  return code == 0
67
 
68
  def push(self, remote: str = "origin", branch: str = "main") -> bool:
69
- """Push changes to remote"""
70
  stdout, stderr, code = TerminalCommand.execute(
71
  ["git", "push", remote, branch],
72
  str(self.repo_path)
@@ -76,7 +235,7 @@ class GitUtilities:
76
  return code == 0
77
 
78
  def create_branch(self, branch_name: str) -> bool:
79
- """Create and checkout a new branch"""
80
  stdout, stderr, code = TerminalCommand.execute(
81
  ["git", "checkout", "-b", branch_name],
82
  str(self.repo_path)
@@ -119,13 +278,21 @@ class GitHubBot:
119
  logger.error(f"Error creating pull request: {e}")
120
  raise
121
 
122
- def resolve_issue(self, token: str, owner: str, repo: str, issue_number: int, resolution: str, forked_repo: str) -> str:
123
- """Resolve a GitHub issue."""
 
 
 
 
 
 
 
 
124
  try:
125
  self.initialize_api(token)
126
- branch_name = f"fix/issue-{issue_number}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
127
 
128
- # Clone repository
129
  if not self.git.clone(forked_repo):
130
  raise Exception("Failed to clone repository")
131
 
@@ -136,7 +303,7 @@ class GitHubBot:
136
  # Generate resolution content
137
  resolution_content = self._create_resolution_document(issue_number, resolution)
138
 
139
- # Save resolution file
140
  resolution_path = Path(self.temp_dir) / f"resolution_{issue_number}.md"
141
  with open(resolution_path, "w") as f:
142
  f.write(resolution_content)
@@ -144,6 +311,7 @@ class GitHubBot:
144
  # Commit and push changes
145
  if not self.git.commit(f"Fix for issue #{issue_number}"):
146
  raise Exception("Failed to commit changes")
 
147
  if not self.git.push("origin", branch_name):
148
  raise Exception("Failed to push changes")
149
 
@@ -155,7 +323,6 @@ class GitHubBot:
155
  body="This PR resolves the reported issue with the following resolution.",
156
  head=branch_name
157
  )
158
-
159
  return f"Pull request created: {pr['html_url']}"
160
  except Exception as e:
161
  logger.error(f"Error resolving issue #{issue_number}: {e}")
@@ -165,22 +332,65 @@ class GitHubBot:
165
  shutil.rmtree(self.temp_dir)
166
 
167
  def _create_resolution_document(self, issue_number: int, resolution: str) -> str:
168
- """Create a resolution document."""
169
  return f"""# Resolution for Issue #{issue_number}
170
  ## Resolution Details
171
  {resolution}
172
  ## Metadata
173
- - Date: {datetime.now(timezone.utc).isoformat()}
174
  - Resolved By: Automated System
175
  """
176
 
177
- def create_gradio_interface():
178
- """Create the Gradio interface."""
 
 
 
 
 
 
 
 
179
  bot = GitHubBot(logger)
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  def on_resolve(token, repo_url, issue_number, resolution, forked_repo):
 
 
 
182
  try:
183
  parts = repo_url.strip("/").split("/")
 
184
  owner, repo = parts[-2], parts[-1]
185
  result = bot.resolve_issue(token, owner, repo, int(issue_number), resolution, forked_repo)
186
  return result
@@ -188,29 +398,67 @@ def create_gradio_interface():
188
  logger.error(f"Error in issue resolution: {e}")
189
  return f"Error: {e}"
190
 
191
- with gr.Blocks() as demo:
192
- gr.Markdown("# GitHub Issue Resolver")
193
- gr.Markdown("Resolve GitHub issues with AI assistance and Git integration.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- with gr.Tab("Issue Resolution"):
196
- with gr.Row():
197
- token_input = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token")
198
- repo_url_input = gr.Textbox(label="Repository URL", placeholder="Enter the repository URL")
199
- issue_number_input = gr.Number(label="Issue Number", precision=0)
200
- resolution_input = gr.Textbox(label="Resolution", placeholder="Describe the resolution for the issue")
201
- forked_repo_input = gr.Textbox(label="Forked Repo URL", placeholder="Enter the forked repository URL")
202
 
 
 
 
 
 
 
 
 
 
 
 
203
  resolve_button = gr.Button("Resolve Issue")
204
  result_output = gr.Textbox(label="Result", interactive=False)
205
 
206
  resolve_button.click(
207
  fn=on_resolve,
208
- inputs=[token_input, repo_url_input, issue_number_input, resolution_input, forked_repo_input],
 
 
 
 
 
 
209
  outputs=[result_output]
210
  )
211
 
212
- return demo
213
 
214
  if __name__ == "__main__":
215
- demo = create_gradio_interface()
216
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import os
2
  import gradio as gr
3
+ import requests
4
+ import uuid
5
+ from huggingface_hub import InferenceClient, HfApi
6
+ from pypdf import PdfReader
7
+ from bs4 import BeautifulSoup
8
+ import datetime
9
+ import zipfile
10
+ import nltk.data
11
+ import nltk
12
  import langchain
 
13
  import dotenv
14
  import yaml
 
15
  from typing import Optional, Union, List, Dict, Any, Tuple
16
  import subprocess
17
  from pathlib import Path
18
  import json
19
  import tempfile
20
+ from datetime import datetime as dt, timezone
21
  import re
 
22
  import logging
23
  import shutil
24
 
25
+ # -----------------------
26
+ # ENV / Logging Setup
27
+ # -----------------------
28
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
29
  logger = logging.getLogger(__name__)
30
 
31
+ # Ensure the 'punkt' tokenizer is downloaded only if missing
32
+ try:
33
+ nltk.data.find('tokenizers/punkt')
34
+ except LookupError:
35
+ nltk.download('punkt')
36
+
37
# Master switch for the print-based diagnostics emitted by log().
VERBOSE = True


def log(message):
    """Print *message* to stdout with a "[LOG] <timestamp> - " prefix.

    Nothing is printed when the module-level ``VERBOSE`` flag is falsy.
    The timestamp is the local time returned by ``datetime.datetime.now()``.
    """
    if not VERBOSE:
        return
    timestamp = datetime.datetime.now()
    print(f"[LOG] {timestamp} - {message}")
41
+
42
+ # -----------------------
43
+ # 1) Scraper/Indexer/Dataset Generator - from your first script
44
+ # -----------------------
45
+
46
+ # == Hugging Face API Setup ==
47
+ HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
48
+ HF_TOKEN = os.environ.get('HF_TOKEN')
49
+ if not HF_TOKEN:
50
+ raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
51
+
52
# Build the shared Hugging Face handles once at import time: `client` is used
# for text generation and `api` for Hub operations.
try:
    # Hand the token to both objects explicitly so they authenticate the same
    # way instead of one of them relying on implicit environment lookup.
    client = InferenceClient(HF_MODEL, token=HF_TOKEN)
    api = HfApi(token=HF_TOKEN)
    log("Initialized Hugging Face client and API.")
except Exception as e:
    log(f"Error initializing Hugging Face client: {e}")
    # The builtin exit() is injected by the `site` module and is not
    # guaranteed to exist (e.g. `python -S`, frozen apps); raising SystemExit
    # is what it does under the hood and always works.
    raise SystemExit(1)
59
+
60
+ REPO_NAME = "acecalisto3/tmp"
61
+ DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
62
+ MAX_TOKENS = 8192
63
+
64
def read_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Location of the PDF; handed straight to ``PdfReader``.

    Returns:
        The per-page texts joined with newlines. If opening or extracting
        fails for any reason the error is reported through ``log`` and an
        empty string is returned instead.
    """
    try:
        pages = PdfReader(file_path).pages
        page_texts = [page.extract_text() for page in pages]
        return "\n".join(page_texts)
    except Exception as e:
        log(f"Error reading PDF {file_path}: {e}")
        return ""
73
+
74
def fetch_url(url, max_depth):
    """Crawl breadth-first from *url* and collect the text of each page.

    The start page has depth 0 and a page is fetched only while its depth is
    strictly below ``max_depth``; ``max_depth=1`` therefore fetches just
    *url*, and values <= 0 fetch nothing.

    Args:
        url: Absolute URL where the crawl starts.
        max_depth: Exclusive depth limit for pages to fetch.

    Returns:
        The text of every successfully fetched page, joined with newlines in
        crawl order. Pages that fail to download or parse are logged via
        ``log`` and skipped.
    """
    from collections import deque  # local import keeps this block self-contained

    # Every URL that has ever been queued. Recording at enqueue time (rather
    # than after a successful fetch) guarantees each URL is requested at most
    # once, even when it fails or is linked from many pages.
    seen = {url}
    queue = deque([(url, 0)])  # O(1) pops from the left, unlike list.pop(0)
    results = []
    while queue:
        current_url, depth = queue.popleft()
        if depth >= max_depth:
            continue
        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            results.append(soup.get_text())
            if depth + 1 >= max_depth:
                # Children would be beyond the limit; don't bother queueing.
                continue
            for link in soup.find_all("a", href=True):
                absolute_url = requests.compat.urljoin(current_url, link.get('href'))
                if absolute_url.startswith("http") and absolute_url not in seen:
                    seen.add(absolute_url)
                    queue.append((absolute_url, depth + 1))
        except Exception as e:
            log(f"Error fetching {current_url}: {e}")
    return "\n".join(results)
97
+
98
def read_txt(txt_path):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        txt_path: Path of the file to read.

    Returns:
        The decoded file contents, or an empty string if the file cannot be
        opened or decoded (the failure is reported through ``log``).
    """
    try:
        with open(txt_path, encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        log(f"Error reading TXT file {txt_path}: {e}")
        contents = ""
    return contents
106
+
107
def read_zip(zip_path):
    """Extract and concatenate the text of all .txt and .pdf members of a ZIP.

    ``.txt`` members are decoded as UTF-8. ``.pdf`` members are written to a
    temporary file so that ``read_pdf`` (which takes a path) can process
    them. Other members are ignored.

    Args:
        zip_path: Path of the ZIP archive.

    Returns:
        The extracted texts joined with newlines in archive order, or an
        empty string if the archive cannot be processed (the failure is
        reported through ``log``).
    """
    try:
        extracted_data = []
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                member_name = file_info.filename
                if not member_name.endswith((".txt", ".pdf")):
                    continue
                content = zip_ref.read(file_info)
                if member_name.endswith(".txt"):
                    extracted_data.append(content.decode("utf-8"))
                    continue
                # PDF member: use the platform's temp directory instead of a
                # hard-coded /tmp, and always delete the file afterwards.
                fd, temp_path = tempfile.mkstemp(suffix=".pdf")
                try:
                    with os.fdopen(fd, "wb") as temp_file:
                        temp_file.write(content)
                    extracted_data.append(read_pdf(temp_path))
                finally:
                    os.remove(temp_path)
        return "\n".join(extracted_data)
    except Exception as e:
        log(f"Error reading ZIP file {zip_path}: {e}")
        return ""
128
+
129
def process_file(file):
    """Extract text from an uploaded file based on its extension.

    Args:
        file: Either an object exposing the file's path as ``.name`` (e.g. a
            temp-file wrapper) or the path itself as a string / path-like.

    Returns:
        The text produced by ``read_pdf``, ``read_txt`` or ``read_zip`` for
        ``.pdf``, ``.txt`` and ``.zip`` files respectively. Any other
        extension, or an error while processing, yields an empty string
        (errors are reported through ``log``).
    """
    # Resolve the path once, up front. Previously the error handler itself
    # read ``file.name``, so a plain path string raised a second
    # AttributeError that escaped this function.
    path = str(getattr(file, "name", file))
    try:
        if path.endswith(".pdf"):
            return read_pdf(path)
        elif path.endswith(".txt"):
            return read_txt(path)
        elif path.endswith(".zip"):
            return read_zip(path)
    except Exception as e:
        log(f"Error processing file {path}: {e}")
    return ""
141
+
142
def chunk_text(text, max_chunk_size):
    """Split *text* into chunks of whole sentences.

    Sentences come from NLTK's English Punkt tokenizer and are packed
    greedily: a chunk is closed as soon as adding the next sentence (plus
    one separating space) would push it past ``max_chunk_size`` characters.

    Args:
        text: The text to split.
        max_chunk_size: Soft upper bound on chunk length, in characters. A
            single sentence longer than this still becomes its own
            (oversized) chunk; sentences are never cut.

    Returns:
        A list of non-empty chunk strings; empty when no sentences are found.
    """
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    chunks = []
    current = []      # sentences of the chunk being built
    current_len = 0   # their total length, counting one space per sentence
    for sentence in tokenizer.tokenize(text):
        # Only flush a chunk that actually holds something; otherwise an
        # oversized sentence would first emit a spurious empty chunk.
        if current and current_len + len(sentence) + 1 > max_chunk_size:
            chunks.append(" ".join(current).strip())
            current, current_len = [], 0
        current.append(sentence)
        current_len += len(sentence) + 1
    if current:
        chunks.append(" ".join(current).strip())
    return chunks
156
+
157
def extract_dataset(data, instructions="Extract {history}", max_tokens=MAX_TOKENS):
    """Run the text-generation model over *data*, one chunk at a time.

    Args:
        data: The raw text to process.
        instructions: Prompt template; ``{history}`` is replaced with the
            current chunk via ``str.format``.
        max_tokens: Passed to the model as ``max_new_tokens``.

    Returns:
        The generated text of every chunk joined with newlines. A chunk that
        fails contributes an "Error processing chunk N: ..." line instead
        (N is 1-based) and is also reported through ``log``.
    """
    extracted = []
    chunks = chunk_text(data, 20000)  # chunk size is in characters; adjust as needed
    for number, chunk in enumerate(chunks, start=1):
        try:
            response = client.text_generation(
                prompt=instructions.format(history=chunk),
                max_new_tokens=max_tokens
            )
            # text_generation returns a plain string unless details are
            # requested; indexing it with "generated_text" raised TypeError
            # for every chunk. Accept the string as well as the detailed
            # object/dict forms.
            if isinstance(response, str):
                generated = response
            elif isinstance(response, dict):
                generated = response["generated_text"]
            else:
                generated = response.generated_text
            extracted.append(generated)
        except Exception as e:
            log(f"Error processing chunk {number}: {e}")
            extracted.append(f"Error processing chunk {number}: {e}")
    return "\n".join(extracted)
172
+
173
def combine_datasets(datasets):
    """Merge several dataset strings into a single newline-separated string.

    Args:
        datasets: Iterable of strings to merge, in order.

    Returns:
        All items joined by a newline; an empty string for an empty input.
    """
    separator = "\n"
    return separator.join(datasets)
176
+
177
+ # -----------------------
178
+ # 2) GitHub Issue Resolver - from your second script
179
+ # -----------------------
180
+
181
  class TerminalCommand:
182
  @staticmethod
183
  def execute(command: Union[str, List[str]], cwd: Optional[str] = None) -> Tuple[str, str, int]:
 
206
  self.repo_path = Path(repo_path)
207
 
208
  def clone(self, url: str, branch: str = "main") -> bool:
209
+ """Clone a repository."""
210
  stdout, stderr, code = TerminalCommand.execute(
211
  f"git clone -b {branch} {url} {self.repo_path}"
212
  )
 
215
  return code == 0
216
 
217
  def commit(self, message: str) -> bool:
218
+ """Create a commit with the given message."""
219
  stdout, stderr, code = TerminalCommand.execute(
220
+ ["git", "commit", "-am", message],
221
  str(self.repo_path)
222
  )
223
  if code != 0:
 
225
  return code == 0
226
 
227
  def push(self, remote: str = "origin", branch: str = "main") -> bool:
228
+ """Push changes to remote."""
229
  stdout, stderr, code = TerminalCommand.execute(
230
  ["git", "push", remote, branch],
231
  str(self.repo_path)
 
235
  return code == 0
236
 
237
  def create_branch(self, branch_name: str) -> bool:
238
+ """Create and checkout a new branch."""
239
  stdout, stderr, code = TerminalCommand.execute(
240
  ["git", "checkout", "-b", branch_name],
241
  str(self.repo_path)
 
278
  logger.error(f"Error creating pull request: {e}")
279
  raise
280
 
281
+ def resolve_issue(
282
+ self,
283
+ token: str,
284
+ owner: str,
285
+ repo: str,
286
+ issue_number: int,
287
+ resolution: str,
288
+ forked_repo: str
289
+ ) -> str:
290
+ """Resolve a GitHub issue by cloning, creating a fix branch, and opening a PR."""
291
  try:
292
  self.initialize_api(token)
293
+ branch_name = f"fix/issue-{issue_number}-{dt.now().strftime('%Y%m%d-%H%M%S')}"
294
 
295
+ # Clone repository (forked repo URL is expected)
296
  if not self.git.clone(forked_repo):
297
  raise Exception("Failed to clone repository")
298
 
 
303
  # Generate resolution content
304
  resolution_content = self._create_resolution_document(issue_number, resolution)
305
 
306
+ # Save resolution file (as an example, you can adjust)
307
  resolution_path = Path(self.temp_dir) / f"resolution_{issue_number}.md"
308
  with open(resolution_path, "w") as f:
309
  f.write(resolution_content)
 
311
  # Commit and push changes
312
  if not self.git.commit(f"Fix for issue #{issue_number}"):
313
  raise Exception("Failed to commit changes")
314
+
315
  if not self.git.push("origin", branch_name):
316
  raise Exception("Failed to push changes")
317
 
 
323
  body="This PR resolves the reported issue with the following resolution.",
324
  head=branch_name
325
  )
 
326
  return f"Pull request created: {pr['html_url']}"
327
  except Exception as e:
328
  logger.error(f"Error resolving issue #{issue_number}: {e}")
 
332
  shutil.rmtree(self.temp_dir)
333
 
334
  def _create_resolution_document(self, issue_number: int, resolution: str) -> str:
335
+ """Create a resolution document for the fix."""
336
  return f"""# Resolution for Issue #{issue_number}
337
  ## Resolution Details
338
  {resolution}
339
  ## Metadata
340
+ - Date: {dt.now(timezone.utc).isoformat()}
341
  - Resolved By: Automated System
342
  """
343
 
344
+ # -----------------------
345
+ # 3) Build the combined Gradio interface with two tabs
346
+ # -----------------------
347
+
348
+ def create_combined_gradio_app():
349
+ """
350
+ Create one Gradio interface that has two tabs:
351
+ 1) 'Scraper/Indexer/Dataset Generator'
352
+ 2) 'GitHub Issue Resolver'
353
+ """
354
  bot = GitHubBot(logger)
355
 
356
+ # 3.1) Functions for the first tab (Scraper/Indexer/Dataset Generator)
357
+ def process_workflow(command, data, files, url, depth):
358
+ datasets = []
359
+ errors = []
360
+ try:
361
+ # If user enters text in the data_input box
362
+ if data:
363
+ datasets.append(data)
364
+
365
+ # If user uploads any files
366
+ if files:
367
+ for file in files:
368
+ datasets.append(process_file(file))
369
+
370
+ # If user supplies a URL
371
+ if url:
372
+ datasets.append(fetch_url(url, max_depth=depth))
373
+
374
+ # Depending on the command chosen, do the logic
375
+ if command == "Extract Dataset":
376
+ return {"datasets": extract_dataset("\n".join(datasets))}, ""
377
+ elif command == "Combine Datasets":
378
+ return {"datasets": combine_datasets(datasets)}, ""
379
+
380
+ # Default: if "Scrape Data" or "Train Chatbot" or unknown
381
+ return {"datasets": datasets}, ""
382
+ except Exception as e:
383
+ errors.append(str(e))
384
+ return {"datasets": []}, "\n".join(errors)
385
+
386
+ # 3.2) Functions for the second tab (GitHub Issue Resolver)
387
  def on_resolve(token, repo_url, issue_number, resolution, forked_repo):
388
+ """
389
+ This callback is used when a user clicks 'Resolve Issue' in the second tab.
390
+ """
391
  try:
392
  parts = repo_url.strip("/").split("/")
393
+ # Typically, the repo URL is something like https://github.com/owner/repo
394
  owner, repo = parts[-2], parts[-1]
395
  result = bot.resolve_issue(token, owner, repo, int(issue_number), resolution, forked_repo)
396
  return result
 
398
  logger.error(f"Error in issue resolution: {e}")
399
  return f"Error: {e}"
400
 
401
+ with gr.Blocks() as main_app:
402
+ # Title / Header
403
+ gr.Markdown("## Combined System: Scraper/Indexer/Dataset Generator & GitHub Issue Resolver")
404
+
405
+ with gr.Tab("Scraper / Indexer / Dataset Generator"):
406
+ gr.Markdown(
407
+ "**Use this tab to upload files, scrape data from URLs, or enter text to generate datasets.**"
408
+ )
409
+
410
+ # The UI from your first script
411
+ chatbot = gr.Chatbot(label="Flash Trained Chatbot (Placeholder)")
412
+ command_selector = gr.Dropdown(
413
+ label="Select Command",
414
+ choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
415
+ value="Scrape Data"
416
+ )
417
+ data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
418
+ file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
419
+ url_input = gr.Textbox(label="URL", placeholder="https://example.com")
420
+ depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
421
+ output_json = gr.JSON(label="Output Dataset")
422
+ error_output = gr.Textbox(label="Error Log", interactive=False)
423
+ process_button = gr.Button("Process")
424
+
425
+ process_button.click(
426
+ process_workflow,
427
+ inputs=[command_selector, data_input, file_upload, url_input, depth_slider],
428
+ outputs=[output_json, error_output]
429
+ )
430
 
431
+ with gr.Tab("GitHub Issue Resolver"):
432
+ gr.Markdown("**Use this tab to resolve GitHub issues by cloning, fixing, and opening PRs.**")
 
 
 
 
 
433
 
434
+ token_input = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token")
435
+ repo_url_input = gr.Textbox(label="Repository URL", placeholder="e.g. https://github.com/owner/repo")
436
+ issue_number_input = gr.Number(label="Issue Number", precision=0, value=1)
437
+ resolution_input = gr.Textbox(
438
+ label="Proposed Resolution",
439
+ placeholder="Describe the resolution for the issue here..."
440
+ )
441
+ forked_repo_input = gr.Textbox(
442
+ label="Forked Repo URL",
443
+ placeholder="e.g. https://github.com/youraccount/repo (your fork)"
444
+ )
445
  resolve_button = gr.Button("Resolve Issue")
446
  result_output = gr.Textbox(label="Result", interactive=False)
447
 
448
  resolve_button.click(
449
  fn=on_resolve,
450
+ inputs=[
451
+ token_input,
452
+ repo_url_input,
453
+ issue_number_input,
454
+ resolution_input,
455
+ forked_repo_input
456
+ ],
457
  outputs=[result_output]
458
  )
459
 
460
+ return main_app
461
 
462
  if __name__ == "__main__":
463
+ app = create_combined_gradio_app()
464
+ app.launch(server_name="0.0.0.0", server_port=7860)