acecalisto3 committed (verified)
Commit 43cf74c · Parent: 5cb59c2

Update app.py

Files changed (1)
  1. app.py +116 -393
app.py CHANGED
@@ -2,149 +2,73 @@ import os
  import gradio as gr
  import requests
  import uuid
- from huggingface_hub import InferenceClient, HfApi
  from pypdf import PdfReader
  from bs4 import BeautifulSoup
- import datetime
  import zipfile
- import nltk.data
  import nltk
- import langchain
- import dotenv
- import yaml
- from typing import Optional, Union, List, Dict, Any, Tuple
- import subprocess
- from pathlib import Path
- import json
- import tempfile
- from datetime import datetime as dt, timezone
- import re
- import logging
- import shutil
-
- # -----------------------
- # ENV / Logging Setup
- # -----------------------
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)

- # Ensure the 'punkt' tokenizer is downloaded only if missing
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

- VERBOSE = True
- def log(message):
-     if VERBOSE:
-         print(f"[LOG] {datetime.datetime.now()} - {message}")
-
- # -----------------------
- # 1) Scraper/Indexer/Dataset Generator - from your first script
- # -----------------------
-
- # == Hugging Face API Setup ==
  HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- HF_TOKEN = os.environ.get('HF_TOKEN')
- if not HF_TOKEN:
-     raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")

- try:
-     client = InferenceClient(HF_MODEL)
-     api = HfApi(token=HF_TOKEN)
-     log("Initialized Hugging Face client and API.")
- except Exception as e:
-     log(f"Error initializing Hugging Face client: {e}")
-     exit(1)

- REPO_NAME = "acecalisto3/tmp"
- DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
- MAX_TOKENS = 8192
-
- def read_pdf(file_path):
-     """Read PDF and return its text."""
      try:
          reader = PdfReader(file_path)
-         text = "\n".join(page.extract_text() for page in reader.pages)
-         return text
-     except Exception as e:
-         log(f"Error reading PDF {file_path}: {e}")
-         return ""
-
- def fetch_url(url, max_depth):
-     """Breadth-first search crawl to a given depth, collecting text."""
-     visited = set()
-     to_visit = [(url, 0)]
-     results = []
-     while to_visit:
-         current_url, depth = to_visit.pop(0)
-         if current_url in visited:
-             continue
-         if depth < max_depth:
-             try:
-                 response = requests.get(current_url, timeout=10)
-                 response.raise_for_status()
-                 visited.add(current_url)
-                 soup = BeautifulSoup(response.content, 'lxml')
-                 results.append(soup.get_text())
-                 for link in soup.find_all("a", href=True):
-                     absolute_url = requests.compat.urljoin(current_url, link.get('href'))
-                     if absolute_url.startswith("http") and absolute_url not in visited:
-                         to_visit.append((absolute_url, depth + 1))
-             except Exception as e:
-                 log(f"Error fetching {current_url}: {e}")
-     return "\n".join(results)
-
- def read_txt(txt_path):
-     """Read text file."""
-     try:
-         with open(txt_path, "r", encoding="utf-8") as f:
-             return f.read()
      except Exception as e:
-         log(f"Error reading TXT file {txt_path}: {e}")
-         return ""

- def read_zip(zip_path):
-     """Read all .txt/.pdf files inside a ZIP."""
      try:
-         extracted_data = []
-         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-             for file_info in zip_ref.infolist():
-                 if file_info.filename.endswith((".txt", ".pdf")):
-                     with zip_ref.open(file_info) as file:
-                         content = file.read()
-                         if file_info.filename.endswith(".txt"):
-                             extracted_data.append(content.decode("utf-8"))
-                         elif file_info.filename.endswith(".pdf"):
-                             temp_path = f"/tmp/{uuid.uuid4()}"
-                             with open(temp_path, "wb") as temp_file:
-                                 temp_file.write(content)
-                             extracted_data.append(read_pdf(temp_path))
-                             os.remove(temp_path)
-         return "\n".join(extracted_data)
      except Exception as e:
-         log(f"Error reading ZIP file {zip_path}: {e}")
-         return ""

- def process_file(file):
-     """Depending on file extension, process file to extract text."""
      try:
          if file.name.endswith(".pdf"):
-             return read_pdf(file.name)
          elif file.name.endswith(".txt"):
-             return read_txt(file.name)
          elif file.name.endswith(".zip"):
-             return read_zip(file.name)
      except Exception as e:
-         log(f"Error processing file {file.name}: {e}")
-         return ""

- def chunk_text(text, max_chunk_size):
-     """Naive chunking based on sentence tokenizer to avoid huge tokens."""
-     tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
-     sentences = tokenizer.tokenize(text)
-     chunks = []
-     current_chunk = ""
      for sentence in sentences:
          if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
              chunks.append(current_chunk.strip())
@@ -154,311 +78,110 @@ def chunk_text(text, max_chunk_size):
      chunks.append(current_chunk.strip())
      return chunks

- def extract_dataset(data, instructions="Extract {history}", max_tokens=MAX_TOKENS):
-     """Call text generation on each chunk with a certain instruction."""
      extracted = []
-     chunks = chunk_text(data, 20000)  # Adjust chunk size as needed
      for i, chunk in enumerate(chunks):
          try:
              response = client.text_generation(
                  prompt=instructions.format(history=chunk),
-                 max_new_tokens=max_tokens
              )
              extracted.append(response["generated_text"])
          except Exception as e:
-             log(f"Error processing chunk {i+1}: {e}")
-             extracted.append(f"Error processing chunk {i+1}: {e}")
      return "\n".join(extracted)

- def combine_datasets(datasets):
-     """Simply combine multiple dataset strings into one big string."""
-     return "\n".join(datasets)

- # -----------------------
- # 2) GitHub Issue Resolver - from your second script
- # -----------------------

- class TerminalCommand:
-     @staticmethod
-     def execute(command: Union[str, List[str]], cwd: Optional[str] = None) -> Tuple[str, str, int]:
-         """
-         Execute a terminal command and return stdout, stderr, and return code
-         """
-         if isinstance(command, str):
-             command = command.split()
-
-         try:
-             process = subprocess.Popen(
-                 command,
-                 stdout=subprocess.PIPE,
-                 stderr=subprocess.PIPE,
-                 cwd=cwd,
-                 text=True
-             )
-             stdout, stderr = process.communicate()
-             return stdout.strip(), stderr.strip(), process.returncode
-         except Exception as e:
-             logger.error(f"Error executing command {command}: {e}")
-             return "", str(e), 1

- class GitUtilities:
-     def __init__(self, repo_path: str):
-         self.repo_path = Path(repo_path)
-
-     def clone(self, url: str, branch: str = "main") -> bool:
-         """Clone a repository."""
-         stdout, stderr, code = TerminalCommand.execute(
-             f"git clone -b {branch} {url} {self.repo_path}"
-         )
-         if code != 0:
-             logger.error(f"Git clone failed: {stderr}")
-         return code == 0
-
-     def commit(self, message: str) -> bool:
-         """Create a commit with the given message."""
-         stdout, stderr, code = TerminalCommand.execute(
-             ["git", "commit", "-am", message],
-             str(self.repo_path)
-         )
-         if code != 0:
-             logger.error(f"Git commit failed: {stderr}")
-         return code == 0
-
-     def push(self, remote: str = "origin", branch: str = "main") -> bool:
-         """Push changes to remote."""
-         stdout, stderr, code = TerminalCommand.execute(
-             ["git", "push", remote, branch],
-             str(self.repo_path)
-         )
-         if code != 0:
-             logger.error(f"Git push failed: {stderr}")
-         return code == 0
-
-     def create_branch(self, branch_name: str) -> bool:
-         """Create and checkout a new branch."""
-         stdout, stderr, code = TerminalCommand.execute(
-             ["git", "checkout", "-b", branch_name],
-             str(self.repo_path)
-         )
-         if code != 0:
-             logger.error(f"Git branch creation failed: {stderr}")
-         return code == 0

- class GitHubBot:
-     def __init__(self, logger: logging.Logger):
-         self.github_api = None
-         self.logger = logger
-         self.ai_provider = None
-         self.git = None
-         self.temp_dir = None
-         self.base_url = "https://api.github.com"
-
-     def initialize_api(self, token: str):
-         """Initialize the GitHub API with a token."""
-         if not token:
-             raise ValueError("GitHub token is required.")
-         self.github_api = {"Authorization": f"Bearer {token}"}
-         self.temp_dir = tempfile.mkdtemp()
-         self.git = GitUtilities(self.temp_dir)
-
-     def create_pull_request(self, owner: str, repo: str, title: str, body: str, head: str, base: str = "main") -> Dict:
-         """Create a pull request."""
-         url = f"{self.base_url}/repos/{owner}/{repo}/pulls"
-         data = {
-             "title": title,
-             "body": body,
-             "head": head,
-             "base": base
-         }
-         try:
-             response = requests.post(url, headers=self.github_api, json=data)
-             response.raise_for_status()
-             return response.json()
-         except requests.RequestException as e:
-             logger.error(f"Error creating pull request: {e}")
-             raise
-
-     def resolve_issue(
-         self,
-         token: str,
-         owner: str,
-         repo: str,
-         issue_number: int,
-         resolution: str,
-         forked_repo: str
-     ) -> str:
-         """Resolve a GitHub issue by cloning, creating a fix branch, and opening a PR."""
-         try:
-             self.initialize_api(token)
-             branch_name = f"fix/issue-{issue_number}-{dt.now().strftime('%Y%m%d-%H%M%S')}"
-
-             # Clone repository (forked repo URL is expected)
-             if not self.git.clone(forked_repo):
-                 raise Exception("Failed to clone repository")
-
-             # Create a new branch
-             if not self.git.create_branch(branch_name):
-                 raise Exception("Failed to create branch")
-
-             # Generate resolution content
-             resolution_content = self._create_resolution_document(issue_number, resolution)
-
-             # Save resolution file (as an example, you can adjust)
-             resolution_path = Path(self.temp_dir) / f"resolution_{issue_number}.md"
-             with open(resolution_path, "w") as f:
-                 f.write(resolution_content)
-
-             # Commit and push changes
-             if not self.git.commit(f"Fix for issue #{issue_number}"):
-                 raise Exception("Failed to commit changes")
-
-             if not self.git.push("origin", branch_name):
-                 raise Exception("Failed to push changes")
-
-             # Create a pull request
-             pr = self.create_pull_request(
-                 owner=owner,
-                 repo=repo,
-                 title=f"Fix for issue #{issue_number}",
-                 body="This PR resolves the reported issue with the following resolution.",
-                 head=branch_name
-             )
-             return f"Pull request created: {pr['html_url']}"
-         except Exception as e:
-             logger.error(f"Error resolving issue #{issue_number}: {e}")
-             return f"Error: {e}"
-         finally:
-             if self.temp_dir and os.path.exists(self.temp_dir):
-                 shutil.rmtree(self.temp_dir)
-
-     def _create_resolution_document(self, issue_number: int, resolution: str) -> str:
-         """Create a resolution document for the fix."""
-         return f"""# Resolution for Issue #{issue_number}
- ## Resolution Details
- {resolution}
- ## Metadata
- - Date: {dt.now(timezone.utc).isoformat()}
- - Resolved By: Automated System
- """
-
- # -----------------------
- # 3) Build the combined Gradio interface with two tabs
- # -----------------------

- def create_combined_gradio_app():
-     """
-     Create one Gradio interface that has two tabs:
-     1) 'Scraper/Indexer/Dataset Generator'
-     2) 'GitHub Issue Resolver'
-     """
-     bot = GitHubBot(logger)
-
-     # 3.1) Functions for the first tab (Scraper/Indexer/Dataset Generator)
-     def process_workflow(command, data, files, url, depth):
-         datasets = []
-         errors = []
-         try:
-             # If user enters text in the data_input box
-             if data:
-                 datasets.append(data)
-
-             # If user uploads any files
-             if files:
-                 for file in files:
-                     datasets.append(process_file(file))
-
-             # If user supplies a URL
-             if url:
-                 datasets.append(fetch_url(url, max_depth=depth))
-
-             # Depending on the command chosen, do the logic
-             if command == "Extract Dataset":
-                 return {"datasets": extract_dataset("\n".join(datasets))}, ""
-             elif command == "Combine Datasets":
-                 return {"datasets": combine_datasets(datasets)}, ""
-
-             # Default: if "Scrape Data" or "Train Chatbot" or unknown
-             return {"datasets": datasets}, ""
-         except Exception as e:
-             errors.append(str(e))
-             return {"datasets": []}, "\n".join(errors)
-
-     # 3.2) Functions for the second tab (GitHub Issue Resolver)
-     def on_resolve(token, repo_url, issue_number, resolution, forked_repo):
-         """
-         This callback is used when a user clicks 'Resolve Issue' in the second tab.
-         """
-         try:
-             parts = repo_url.strip("/").split("/")
-             # Typically, the repo URL is something like https://github.com/owner/repo
-             owner, repo = parts[-2], parts[-1]
-             result = bot.resolve_issue(token, owner, repo, int(issue_number), resolution, forked_repo)
-             return result
-         except Exception as e:
-             logger.error(f"Error in issue resolution: {e}")
-             return f"Error: {e}"
-
-     with gr.Blocks() as main_app:
-         # Title / Header
-         gr.Markdown("## Combined System: Scraper/Indexer/Dataset Generator & GitHub Issue Resolver")
-
-         with gr.Tab("Scraper / Indexer / Dataset Generator"):
-             gr.Markdown(
-                 "**Use this tab to upload files, scrape data from URLs, or enter text to generate datasets.**"
-             )
-
-             # The UI from your first script
-             chatbot = gr.Chatbot(label="Flash Trained Chatbot (Placeholder)")
-             command_selector = gr.Dropdown(
-                 label="Select Command",
-                 choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
-                 value="Scrape Data"
-             )
-             data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
-             file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
-             url_input = gr.Textbox(label="URL", placeholder="https://example.com")
-             depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
-             output_json = gr.JSON(label="Output Dataset")
-             error_output = gr.Textbox(label="Error Log", interactive=False)
-             process_button = gr.Button("Process")
-
-             process_button.click(
-                 process_workflow,
-                 inputs=[command_selector, data_input, file_upload, url_input, depth_slider],
-                 outputs=[output_json, error_output]
-             )
-
-         with gr.Tab("GitHub Issue Resolver"):
-             gr.Markdown("**Use this tab to resolve GitHub issues by cloning, fixing, and opening PRs.**")
-
-             token_input = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token")
-             repo_url_input = gr.Textbox(label="Repository URL", placeholder="e.g. https://github.com/owner/repo")
-             issue_number_input = gr.Number(label="Issue Number", precision=0, value=1)
-             resolution_input = gr.Textbox(
-                 label="Proposed Resolution",
-                 placeholder="Describe the resolution for the issue here..."
-             )
-             forked_repo_input = gr.Textbox(
-                 label="Forked Repo URL",
-                 placeholder="e.g. https://github.com/youraccount/repo (your fork)"
-             )
-             resolve_button = gr.Button("Resolve Issue")
-             result_output = gr.Textbox(label="Result", interactive=False)
-
-             resolve_button.click(
-                 fn=on_resolve,
-                 inputs=[
-                     token_input,
-                     repo_url_input,
-                     issue_number_input,
-                     resolution_input,
-                     forked_repo_input
-                 ],
-                 outputs=[result_output]
-             )
-
-     return main_app
-
-
- if __name__ == "__main__":
-     app = create_combined_gradio_app()
-     app.launch(server_name="0.0.0.0", server_port=7860)

  import gradio as gr
  import requests
  import uuid
+ import json
+ from huggingface_hub import InferenceClient
  from pypdf import PdfReader
  from bs4 import BeautifulSoup
  import zipfile
  import nltk
+ from typing import List, Dict

+ # Ensure NLTK resources
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

+ # Initialize Hugging Face API
  HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)

+ # State to manage datasets
+ datasets_queue = []

+ # Helper Functions
+ def extract_text_from_pdf(file_path):
      try:
          reader = PdfReader(file_path)
+         return "\n".join(page.extract_text() for page in reader.pages)
      except Exception as e:
+         return f"Error reading PDF: {e}"

+ def extract_text_from_url(url):
      try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.content, "lxml")
+         return soup.get_text()
      except Exception as e:
+         return f"Error scraping URL: {e}"

+ def process_uploaded_file(file):
      try:
          if file.name.endswith(".pdf"):
+             return extract_text_from_pdf(file.name)
          elif file.name.endswith(".txt"):
+             with open(file.name, "r", encoding="utf-8") as f:
+                 return f.read()
          elif file.name.endswith(".zip"):
+             extracted_data = []
+             with zipfile.ZipFile(file.name, "r") as zip_ref:
+                 for file_info in zip_ref.infolist():
+                     if file_info.filename.endswith((".pdf", ".txt")):
+                         with zip_ref.open(file_info) as f:
+                             content = f.read()
+                             if file_info.filename.endswith(".txt"):
+                                 extracted_data.append(content.decode("utf-8"))
+                             elif file_info.filename.endswith(".pdf"):
+                                 temp_path = f"/tmp/{uuid.uuid4()}"
+                                 with open(temp_path, "wb") as temp_file:
+                                     temp_file.write(content)
+                                 extracted_data.append(extract_text_from_pdf(temp_path))
+             return "\n".join(extracted_data)
      except Exception as e:
+         return f"Error processing file: {e}"

+ def chunk_text(text, max_chunk_size=2000):
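+     # Split the text into sentence-based chunks of at most max_chunk_size characters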
+     sentences = nltk.sent_tokenize(text)
+     chunks, current_chunk = [], ""
      for sentence in sentences:
          if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
              chunks.append(current_chunk.strip())
      chunks.append(current_chunk.strip())
      return chunks

+ def infer_dataset(data, instructions):
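+     # Format each chunk into the instruction prompt and join the generated outputs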
 
      extracted = []
+     chunks = chunk_text(data)
      for i, chunk in enumerate(chunks):
          try:
              response = client.text_generation(
                  prompt=instructions.format(history=chunk),
+                 max_new_tokens=1024
              )
              extracted.append(response["generated_text"])
          except Exception as e:
+             extracted.append(f"Error in chunk {i}: {e}")
      return "\n".join(extracted)

+ # Gradio Interface
+ def scrape_data(instructions, files, urls):
+     combined_data = []

+     # Process uploaded files
+     if files:
+         for file in files:
+             combined_data.append(process_uploaded_file(file))

+     # Process URLs
+     if urls:
+         url_list = [url.strip() for url in urls.split(",") if url.strip()]
+         for url in url_list:
+             combined_data.append(extract_text_from_url(url))

+     # Combine and infer with instructions
+     full_text = "\n".join(combined_data)
+     if instructions:
+         dataset = infer_dataset(full_text, instructions)
+     else:
+         dataset = full_text

+     return dataset

+ def add_to_queue(dataset):
+     datasets_queue.append(dataset)
+     return json.dumps(datasets_queue, indent=2)

+ def combine_datasets():
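+     # Merge every queued dataset into one JSON payload and write it to a downloadable file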
+     combined_data = "\n".join(datasets_queue)
+     combined_json = {"combined_dataset": combined_data}
+     combined_file = "/tmp/combined_dataset.json"
+     with open(combined_file, "w") as f:
+         json.dump(combined_json, f, indent=2)
+     return json.dumps(combined_json, indent=2), combined_file

+ def train_chatbot(dataset):
+     system_message = {"system": "You are a bot trained on the following dataset:"}
+     system_message["dataset"] = dataset
+     return "Chatbot trained successfully!"

+ def chat_with_bot(history, user_input):
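+     # Prepend the stored dataset to the user's message and append the model's reply to the chat history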
+     if "dataset" not in system_message:
+         return history + [(user_input, "No dataset loaded for the chatbot.")]

+     bot_response = client.text_generation(
+         prompt=f"{system_message['dataset']} {user_input}",
+         max_new_tokens=128
+     )
+     return history + [(user_input, bot_response["generated_text"])]

+ # Gradio Interface
+ with gr.Blocks() as app:
+     gr.Markdown("# Intelligent Scraper, Dataset Handler, and Chatbot")

+     with gr.Tab("Scrape / Extract Data"):
+         gr.Markdown("Upload files or enter URLs to scrape data and generate JSON datasets.")

+         instruction_input = gr.Textbox(label="Optional Instructions", placeholder="Enter instructions for scraping.")
+         upload_files = gr.Files(label="Upload Files (PDF, TXT, ZIP)", file_types=[".pdf", ".txt", ".zip"])
+         url_input = gr.Textbox(label="Enter URLs (comma-separated or multiline)")
+         scrape_button = gr.Button("Scrape / Extract Data")

+         extracted_output = gr.Textbox(label="Extracted Output")
+         dataset_button = gr.Button("Add to Dataset Queue")
+         scraped_dataset = gr.Textbox(label="Current Dataset")

+         scrape_button.click(scrape_data, inputs=[instruction_input, upload_files, url_input], outputs=extracted_output)
+         dataset_button.click(add_to_queue, inputs=[extracted_output], outputs=scraped_dataset)

+     with gr.Tab("Combine Datasets"):
+         gr.Markdown("Combine queued datasets into a single JSON dataset.")

+         combine_button = gr.Button("Combine Datasets")
+         combined_output = gr.Textbox(label="Combined Dataset")
+         download_button = gr.Button("Download Combined Dataset")
+         download_output = gr.File(label="Download")

+         combine_button.click(combine_datasets, outputs=[combined_output, download_output])

+     with gr.Tab("Train and Chat"):
+         gr.Markdown("Train a chatbot with a selected dataset and interact with it.")

+         chat_dataset = gr.Textbox(label="Dataset for Training", placeholder="Paste or load a dataset for training.")
+         train_button = gr.Button("Train Chatbot")
+         chatbot = gr.Chatbot(label="Chat with Trained Bot")

+         system_message = {"system": "You are a bot trained on the following dataset:"}

+         train_button.click(train_chatbot, inputs=[chat_dataset], outputs=None)
+         chatbot.click(chat_with_bot, inputs=[chatbot, gr.Textbox(label="User Input")], outputs=chatbot)

+ app.launch()