Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,149 +2,73 @@ import os
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import uuid
|
5 |
-
|
|
|
6 |
from pypdf import PdfReader
|
7 |
from bs4 import BeautifulSoup
|
8 |
-
import datetime
|
9 |
import zipfile
|
10 |
-
import nltk.data
|
11 |
import nltk
|
12 |
-
import
|
13 |
-
import dotenv
|
14 |
-
import yaml
|
15 |
-
from typing import Optional, Union, List, Dict, Any, Tuple
|
16 |
-
import subprocess
|
17 |
-
from pathlib import Path
|
18 |
-
import json
|
19 |
-
import tempfile
|
20 |
-
from datetime import datetime as dt, timezone
|
21 |
-
import re
|
22 |
-
import logging
|
23 |
-
import shutil
|
24 |
-
|
25 |
-
# -----------------------
|
26 |
-
# ENV / Logging Setup
|
27 |
-
# -----------------------
|
28 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
29 |
-
logger = logging.getLogger(__name__)
|
30 |
|
31 |
-
# Ensure
|
32 |
try:
|
33 |
nltk.data.find('tokenizers/punkt')
|
34 |
except LookupError:
|
35 |
nltk.download('punkt')
|
36 |
|
37 |
-
|
38 |
-
def log(message):
|
39 |
-
if VERBOSE:
|
40 |
-
print(f"[LOG] {datetime.datetime.now()} - {message}")
|
41 |
-
|
42 |
-
# -----------------------
|
43 |
-
# 1) Scraper/Indexer/Dataset Generator - from your first script
|
44 |
-
# -----------------------
|
45 |
-
|
46 |
-
# == Hugging Face API Setup ==
|
47 |
HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
48 |
-
HF_TOKEN = os.environ.get(
|
49 |
-
|
50 |
-
raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
api = HfApi(token=HF_TOKEN)
|
55 |
-
log("Initialized Hugging Face client and API.")
|
56 |
-
except Exception as e:
|
57 |
-
log(f"Error initializing Hugging Face client: {e}")
|
58 |
-
exit(1)
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
MAX_TOKENS = 8192
|
63 |
-
|
64 |
-
def read_pdf(file_path):
|
65 |
-
"""Read PDF and return its text."""
|
66 |
try:
|
67 |
reader = PdfReader(file_path)
|
68 |
-
|
69 |
-
return text
|
70 |
-
except Exception as e:
|
71 |
-
log(f"Error reading PDF {file_path}: {e}")
|
72 |
-
return ""
|
73 |
-
|
74 |
-
def fetch_url(url, max_depth):
|
75 |
-
"""Breadth-first search crawl to a given depth, collecting text."""
|
76 |
-
visited = set()
|
77 |
-
to_visit = [(url, 0)]
|
78 |
-
results = []
|
79 |
-
while to_visit:
|
80 |
-
current_url, depth = to_visit.pop(0)
|
81 |
-
if current_url in visited:
|
82 |
-
continue
|
83 |
-
if depth < max_depth:
|
84 |
-
try:
|
85 |
-
response = requests.get(current_url, timeout=10)
|
86 |
-
response.raise_for_status()
|
87 |
-
visited.add(current_url)
|
88 |
-
soup = BeautifulSoup(response.content, 'lxml')
|
89 |
-
results.append(soup.get_text())
|
90 |
-
for link in soup.find_all("a", href=True):
|
91 |
-
absolute_url = requests.compat.urljoin(current_url, link.get('href'))
|
92 |
-
if absolute_url.startswith("http") and absolute_url not in visited:
|
93 |
-
to_visit.append((absolute_url, depth + 1))
|
94 |
-
except Exception as e:
|
95 |
-
log(f"Error fetching {current_url}: {e}")
|
96 |
-
return "\n".join(results)
|
97 |
-
|
98 |
-
def read_txt(txt_path):
|
99 |
-
"""Read text file."""
|
100 |
-
try:
|
101 |
-
with open(txt_path, "r", encoding="utf-8") as f:
|
102 |
-
return f.read()
|
103 |
except Exception as e:
|
104 |
-
|
105 |
-
return ""
|
106 |
|
107 |
-
def
|
108 |
-
"""Read all .txt/.pdf files inside a ZIP."""
|
109 |
try:
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
with zip_ref.open(file_info) as file:
|
115 |
-
content = file.read()
|
116 |
-
if file_info.filename.endswith(".txt"):
|
117 |
-
extracted_data.append(content.decode("utf-8"))
|
118 |
-
elif file_info.filename.endswith(".pdf"):
|
119 |
-
temp_path = f"/tmp/{uuid.uuid4()}"
|
120 |
-
with open(temp_path, "wb") as temp_file:
|
121 |
-
temp_file.write(content)
|
122 |
-
extracted_data.append(read_pdf(temp_path))
|
123 |
-
os.remove(temp_path)
|
124 |
-
return "\n".join(extracted_data)
|
125 |
except Exception as e:
|
126 |
-
|
127 |
-
return ""
|
128 |
|
129 |
-
def
|
130 |
-
"""Depending on file extension, process file to extract text."""
|
131 |
try:
|
132 |
if file.name.endswith(".pdf"):
|
133 |
-
return
|
134 |
elif file.name.endswith(".txt"):
|
135 |
-
|
|
|
136 |
elif file.name.endswith(".zip"):
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
except Exception as e:
|
139 |
-
|
140 |
-
return ""
|
141 |
|
142 |
-
def chunk_text(text, max_chunk_size):
|
143 |
-
|
144 |
-
|
145 |
-
sentences = tokenizer.tokenize(text)
|
146 |
-
chunks = []
|
147 |
-
current_chunk = ""
|
148 |
for sentence in sentences:
|
149 |
if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
|
150 |
chunks.append(current_chunk.strip())
|
@@ -154,311 +78,110 @@ def chunk_text(text, max_chunk_size):
|
|
154 |
chunks.append(current_chunk.strip())
|
155 |
return chunks
|
156 |
|
157 |
-
def
|
158 |
-
"""Call text generation on each chunk with a certain instruction."""
|
159 |
extracted = []
|
160 |
-
chunks = chunk_text(data
|
161 |
for i, chunk in enumerate(chunks):
|
162 |
try:
|
163 |
response = client.text_generation(
|
164 |
prompt=instructions.format(history=chunk),
|
165 |
-
max_new_tokens=
|
166 |
)
|
167 |
extracted.append(response["generated_text"])
|
168 |
except Exception as e:
|
169 |
-
|
170 |
-
extracted.append(f"Error processing chunk {i+1}: {e}")
|
171 |
return "\n".join(extracted)
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
|
177 |
-
#
|
178 |
-
|
179 |
-
|
|
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
"""
|
187 |
-
if isinstance(command, str):
|
188 |
-
command = command.split()
|
189 |
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
text=True
|
197 |
-
)
|
198 |
-
stdout, stderr = process.communicate()
|
199 |
-
return stdout.strip(), stderr.strip(), process.returncode
|
200 |
-
except Exception as e:
|
201 |
-
logger.error(f"Error executing command {command}: {e}")
|
202 |
-
return "", str(e), 1
|
203 |
|
204 |
-
|
205 |
-
def __init__(self, repo_path: str):
|
206 |
-
self.repo_path = Path(repo_path)
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
f"git clone -b {branch} {url} {self.repo_path}"
|
212 |
-
)
|
213 |
-
if code != 0:
|
214 |
-
logger.error(f"Git clone failed: {stderr}")
|
215 |
-
return code == 0
|
216 |
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
)
|
223 |
-
|
224 |
-
logger.error(f"Git commit failed: {stderr}")
|
225 |
-
return code == 0
|
226 |
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
str(self.repo_path)
|
232 |
-
)
|
233 |
-
if code != 0:
|
234 |
-
logger.error(f"Git push failed: {stderr}")
|
235 |
-
return code == 0
|
236 |
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
["git", "checkout", "-b", branch_name],
|
241 |
-
str(self.repo_path)
|
242 |
-
)
|
243 |
-
if code != 0:
|
244 |
-
logger.error(f"Git branch creation failed: {stderr}")
|
245 |
-
return code == 0
|
246 |
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
self.git = None
|
253 |
-
self.temp_dir = None
|
254 |
-
self.base_url = "https://api.github.com"
|
255 |
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
raise ValueError("GitHub token is required.")
|
260 |
-
self.github_api = {"Authorization": f"Bearer {token}"}
|
261 |
-
self.temp_dir = tempfile.mkdtemp()
|
262 |
-
self.git = GitUtilities(self.temp_dir)
|
263 |
|
264 |
-
|
265 |
-
"
|
266 |
-
url = f"{self.base_url}/repos/{owner}/{repo}/pulls"
|
267 |
-
data = {
|
268 |
-
"title": title,
|
269 |
-
"body": body,
|
270 |
-
"head": head,
|
271 |
-
"base": base
|
272 |
-
}
|
273 |
-
try:
|
274 |
-
response = requests.post(url, headers=self.github_api, json=data)
|
275 |
-
response.raise_for_status()
|
276 |
-
return response.json()
|
277 |
-
except requests.RequestException as e:
|
278 |
-
logger.error(f"Error creating pull request: {e}")
|
279 |
-
raise
|
280 |
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
repo: str,
|
286 |
-
issue_number: int,
|
287 |
-
resolution: str,
|
288 |
-
forked_repo: str
|
289 |
-
) -> str:
|
290 |
-
"""Resolve a GitHub issue by cloning, creating a fix branch, and opening a PR."""
|
291 |
-
try:
|
292 |
-
self.initialize_api(token)
|
293 |
-
branch_name = f"fix/issue-{issue_number}-{dt.now().strftime('%Y%m%d-%H%M%S')}"
|
294 |
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
|
299 |
-
|
300 |
-
|
301 |
-
raise Exception("Failed to create branch")
|
302 |
|
303 |
-
|
304 |
-
|
305 |
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
|
311 |
-
|
312 |
-
if not self.git.commit(f"Fix for issue #{issue_number}"):
|
313 |
-
raise Exception("Failed to commit changes")
|
314 |
|
315 |
-
|
316 |
-
|
317 |
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
repo=repo,
|
322 |
-
title=f"Fix for issue #{issue_number}",
|
323 |
-
body="This PR resolves the reported issue with the following resolution.",
|
324 |
-
head=branch_name
|
325 |
-
)
|
326 |
-
return f"Pull request created: {pr['html_url']}"
|
327 |
-
except Exception as e:
|
328 |
-
logger.error(f"Error resolving issue #{issue_number}: {e}")
|
329 |
-
return f"Error: {e}"
|
330 |
-
finally:
|
331 |
-
if self.temp_dir and os.path.exists(self.temp_dir):
|
332 |
-
shutil.rmtree(self.temp_dir)
|
333 |
-
|
334 |
-
def _create_resolution_document(self, issue_number: int, resolution: str) -> str:
|
335 |
-
"""Create a resolution document for the fix."""
|
336 |
-
return f"""# Resolution for Issue #{issue_number}
|
337 |
-
## Resolution Details
|
338 |
-
{resolution}
|
339 |
-
## Metadata
|
340 |
-
- Date: {dt.now(timezone.utc).isoformat()}
|
341 |
-
- Resolved By: Automated System
|
342 |
-
"""
|
343 |
-
|
344 |
-
# -----------------------
|
345 |
-
# 3) Build the combined Gradio interface with two tabs
|
346 |
-
# -----------------------
|
347 |
|
348 |
-
|
349 |
-
"""
|
350 |
-
Create one Gradio interface that has two tabs:
|
351 |
-
1) 'Scraper/Indexer/Dataset Generator'
|
352 |
-
2) 'GitHub Issue Resolver'
|
353 |
-
"""
|
354 |
-
bot = GitHubBot(logger)
|
355 |
-
|
356 |
-
# 3.1) Functions for the first tab (Scraper/Indexer/Dataset Generator)
|
357 |
-
def process_workflow(command, data, files, url, depth):
|
358 |
-
datasets = []
|
359 |
-
errors = []
|
360 |
-
try:
|
361 |
-
# If user enters text in the data_input box
|
362 |
-
if data:
|
363 |
-
datasets.append(data)
|
364 |
-
|
365 |
-
# If user uploads any files
|
366 |
-
if files:
|
367 |
-
for file in files:
|
368 |
-
datasets.append(process_file(file))
|
369 |
-
|
370 |
-
# If user supplies a URL
|
371 |
-
if url:
|
372 |
-
datasets.append(fetch_url(url, max_depth=depth))
|
373 |
-
|
374 |
-
# Depending on the command chosen, do the logic
|
375 |
-
if command == "Extract Dataset":
|
376 |
-
return {"datasets": extract_dataset("\n".join(datasets))}, ""
|
377 |
-
elif command == "Combine Datasets":
|
378 |
-
return {"datasets": combine_datasets(datasets)}, ""
|
379 |
-
|
380 |
-
# Default: if "Scrape Data" or "Train Chatbot" or unknown
|
381 |
-
return {"datasets": datasets}, ""
|
382 |
-
except Exception as e:
|
383 |
-
errors.append(str(e))
|
384 |
-
return {"datasets": []}, "\n".join(errors)
|
385 |
-
|
386 |
-
# 3.2) Functions for the second tab (GitHub Issue Resolver)
|
387 |
-
def on_resolve(token, repo_url, issue_number, resolution, forked_repo):
|
388 |
-
"""
|
389 |
-
This callback is used when a user clicks 'Resolve Issue' in the second tab.
|
390 |
-
"""
|
391 |
-
try:
|
392 |
-
parts = repo_url.strip("/").split("/")
|
393 |
-
# Typically, the repo URL is something like https://github.com/owner/repo
|
394 |
-
owner, repo = parts[-2], parts[-1]
|
395 |
-
result = bot.resolve_issue(token, owner, repo, int(issue_number), resolution, forked_repo)
|
396 |
-
return result
|
397 |
-
except Exception as e:
|
398 |
-
logger.error(f"Error in issue resolution: {e}")
|
399 |
-
return f"Error: {e}"
|
400 |
-
|
401 |
-
with gr.Blocks() as main_app:
|
402 |
-
# Title / Header
|
403 |
-
gr.Markdown("## Combined System: Scraper/Indexer/Dataset Generator & GitHub Issue Resolver")
|
404 |
-
|
405 |
-
with gr.Tab("Scraper / Indexer / Dataset Generator"):
|
406 |
-
gr.Markdown(
|
407 |
-
"**Use this tab to upload files, scrape data from URLs, or enter text to generate datasets.**"
|
408 |
-
)
|
409 |
-
|
410 |
-
# The UI from your first script
|
411 |
-
chatbot = gr.Chatbot(label="Flash Trained Chatbot (Placeholder)")
|
412 |
-
command_selector = gr.Dropdown(
|
413 |
-
label="Select Command",
|
414 |
-
choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
|
415 |
-
value="Scrape Data"
|
416 |
-
)
|
417 |
-
data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
|
418 |
-
file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
|
419 |
-
url_input = gr.Textbox(label="URL", placeholder="https://example.com")
|
420 |
-
depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
|
421 |
-
output_json = gr.JSON(label="Output Dataset")
|
422 |
-
error_output = gr.Textbox(label="Error Log", interactive=False)
|
423 |
-
process_button = gr.Button("Process")
|
424 |
-
|
425 |
-
process_button.click(
|
426 |
-
process_workflow,
|
427 |
-
inputs=[command_selector, data_input, file_upload, url_input, depth_slider],
|
428 |
-
outputs=[output_json, error_output]
|
429 |
-
)
|
430 |
-
|
431 |
-
with gr.Tab("GitHub Issue Resolver"):
|
432 |
-
gr.Markdown("**Use this tab to resolve GitHub issues by cloning, fixing, and opening PRs.**")
|
433 |
-
|
434 |
-
token_input = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token")
|
435 |
-
repo_url_input = gr.Textbox(label="Repository URL", placeholder="e.g. https://github.com/owner/repo")
|
436 |
-
issue_number_input = gr.Number(label="Issue Number", precision=0, value=1)
|
437 |
-
resolution_input = gr.Textbox(
|
438 |
-
label="Proposed Resolution",
|
439 |
-
placeholder="Describe the resolution for the issue here..."
|
440 |
-
)
|
441 |
-
forked_repo_input = gr.Textbox(
|
442 |
-
label="Forked Repo URL",
|
443 |
-
placeholder="e.g. https://github.com/youraccount/repo (your fork)"
|
444 |
-
)
|
445 |
-
resolve_button = gr.Button("Resolve Issue")
|
446 |
-
result_output = gr.Textbox(label="Result", interactive=False)
|
447 |
-
|
448 |
-
resolve_button.click(
|
449 |
-
fn=on_resolve,
|
450 |
-
inputs=[
|
451 |
-
token_input,
|
452 |
-
repo_url_input,
|
453 |
-
issue_number_input,
|
454 |
-
resolution_input,
|
455 |
-
forked_repo_input
|
456 |
-
],
|
457 |
-
outputs=[result_output]
|
458 |
-
)
|
459 |
|
460 |
-
|
|
|
461 |
|
462 |
-
|
463 |
-
app = create_combined_gradio_app()
|
464 |
-
app.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import uuid
|
5 |
+
import json
|
6 |
+
from huggingface_hub import InferenceClient
|
7 |
from pypdf import PdfReader
|
8 |
from bs4 import BeautifulSoup
|
|
|
9 |
import zipfile
|
|
|
10 |
import nltk
|
11 |
+
from typing import List, Dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
+
# Ensure NLTK resources
|
14 |
# Make sure the sentence-tokenizer data used by nltk.sent_tokenize is present.
# Older NLTK releases load the pickled "punkt" models, newer releases load
# "punkt_tab", so both are checked and fetched only when missing.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
|
18 |
|
19 |
+
# Initialize Hugging Face API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
# Hugging Face inference configuration. The token is read from the
# environment and is None when the HF_TOKEN variable is not set.
HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
HF_TOKEN = os.getenv("HF_TOKEN")
client = InferenceClient(token=HF_TOKEN, model=HF_MODEL)

# Module-level list holding every dataset queued through add_to_queue.
datasets_queue = list()
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
+
# Helper Functions
|
28 |
+
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Value handed straight to ``PdfReader``.

    Returns:
        The joined page text, or a string starting with
        ``"Error reading PDF: "`` when anything raises.
    """
    try:
        pdf = PdfReader(file_path)
        page_texts = [page.extract_text() for page in pdf.pages]
        return "\n".join(page_texts)
    except Exception as err:
        # Failures are reported as text instead of being raised.
        return f"Error reading PDF: {err}"
|
|
|
34 |
|
35 |
+
def extract_text_from_url(url):
    """Download a page and return the text content of its markup.

    Args:
        url: Address passed to ``requests.get`` (10 second timeout).

    Returns:
        The text produced by BeautifulSoup's ``get_text()``, or a string
        starting with ``"Error scraping URL: "`` when anything raises
        (including a non-success HTTP status).
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, "lxml")
        return parsed.get_text()
    except Exception as err:
        # Failures are reported as text instead of being raised.
        return f"Error scraping URL: {err}"
|
|
|
43 |
|
44 |
+
def process_uploaded_file(file):
    """Extract text from an uploaded .pdf, .txt or .zip file.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute.

    Returns:
        The extracted text. For a zip archive, the text of every .txt/.pdf
        member joined by newlines. A string starting with
        ``"Error processing file"`` is returned when the extension is not
        supported or when anything raises.
    """
    try:
        # Accept plain paths as well as upload wrappers carrying ``.name``.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        lowered = path.lower()  # extension match is case-insensitive

        if lowered.endswith(".pdf"):
            return extract_text_from_pdf(path)

        if lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                return f.read()

        if lowered.endswith(".zip"):
            extracted_data = []
            with zipfile.ZipFile(path, "r") as zip_ref:
                for file_info in zip_ref.infolist():
                    member = file_info.filename.lower()
                    if not member.endswith((".pdf", ".txt")):
                        continue
                    with zip_ref.open(file_info) as f:
                        content = f.read()
                    if member.endswith(".txt"):
                        extracted_data.append(content.decode("utf-8"))
                    else:
                        # The PDF helper takes a path, so spill the member to
                        # a temporary file and always remove it afterwards.
                        temp_path = f"/tmp/{uuid.uuid4()}"
                        try:
                            with open(temp_path, "wb") as temp_file:
                                temp_file.write(content)
                            extracted_data.append(extract_text_from_pdf(temp_path))
                        finally:
                            if os.path.exists(temp_path):
                                os.remove(temp_path)
            return "\n".join(extracted_data)

        # Always hand back a string so callers can join results safely.
        return f"Error processing file: unsupported file type: {path}"
    except Exception as e:
        return f"Error processing file: {e}"
|
|
|
68 |
|
69 |
+
def chunk_text(text, max_chunk_size=2000):
|
70 |
+
sentences = nltk.sent_tokenize(text)
|
71 |
+
chunks, current_chunk = [], ""
|
|
|
|
|
|
|
72 |
for sentence in sentences:
|
73 |
if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
|
74 |
chunks.append(current_chunk.strip())
|
|
|
78 |
chunks.append(current_chunk.strip())
|
79 |
return chunks
|
80 |
|
81 |
+
def infer_dataset(data, instructions):
    """Run the text-generation client over ``data`` chunk by chunk.

    Args:
        data: Text to process; it is split with ``chunk_text``.
        instructions: Prompt text. When it contains a ``{history}``
            placeholder the chunk is substituted there via ``str.format``;
            otherwise the chunk is appended after the instructions so the
            model still sees the data.

    Returns:
        The generated text for every chunk joined by newlines. A chunk that
        fails contributes an ``"Error in chunk <i>: ..."`` line instead.
    """
    extracted = []
    chunks = chunk_text(data)
    for i, chunk in enumerate(chunks):
        try:
            if "{history}" in instructions:
                prompt = instructions.format(history=chunk)
            else:
                prompt = f"{instructions}\n\n{chunk}"
            response = client.text_generation(
                prompt=prompt,
                max_new_tokens=1024
            )
            # The client returns a plain string unless detailed output was
            # requested; only index into the response when it is not a string.
            if isinstance(response, str):
                extracted.append(response)
            else:
                extracted.append(response["generated_text"])
        except Exception as e:
            extracted.append(f"Error in chunk {i}: {e}")
    return "\n".join(extracted)
|
94 |
|
95 |
+
# Gradio Interface
|
96 |
+
def scrape_data(instructions, files, urls):
    """Collect text from uploaded files and URLs, optionally run inference.

    Args:
        instructions: Optional prompt text; when truthy the collected text
            is passed through ``infer_dataset``.
        files: Iterable of uploaded files, or ``None``.
        urls: String of URLs separated by commas and/or line breaks,
            or ``None``.

    Returns:
        The inferred dataset when instructions were given, otherwise the
        collected text joined by newlines.
    """
    combined_data = []

    # Process uploaded files
    for file in files or []:
        text = process_uploaded_file(file)
        # Guard the join below against a missing result.
        if text is not None:
            combined_data.append(text)

    # Process URLs: treat line breaks like commas so both layouts work.
    if urls:
        normalized = urls.replace("\r", "\n").replace("\n", ",")
        for url in normalized.split(","):
            url = url.strip()
            if url:
                combined_data.append(extract_text_from_url(url))

    # Combine and infer with instructions
    full_text = "\n".join(combined_data)
    if instructions:
        return infer_dataset(full_text, instructions)
    return full_text
|
|
|
|
|
118 |
|
119 |
+
def add_to_queue(dataset):
    """Append ``dataset`` to the module-level queue.

    Returns:
        The whole queue rendered as JSON with two-space indentation.
    """
    datasets_queue.extend([dataset])
    rendered_queue = json.dumps(datasets_queue, indent=2)
    return rendered_queue
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
+
def combine_datasets():
    """Merge every queued dataset into one JSON document and save it.

    The queued entries are joined with newlines, stored under the
    ``"combined_dataset"`` key and written to ``/tmp/combined_dataset.json``.

    Returns:
        A ``(json_text, file_path)`` tuple.
    """
    payload = {"combined_dataset": "\n".join(datasets_queue)}
    rendered = json.dumps(payload, indent=2)
    out_path = "/tmp/combined_dataset.json"
    with open(out_path, "w") as handle:
        handle.write(rendered)
    return rendered, out_path
|
|
|
|
|
130 |
|
131 |
+
def train_chatbot(dataset):
    """Store ``dataset`` as the context used by the chat callback.

    No model training happens here: the dataset is recorded in the
    module-level ``system_message`` dict that ``chat_with_bot`` reads.

    Args:
        dataset: Text to use as the chatbot's context.

    Returns:
        A fixed confirmation message.
    """
    # Rebind the module-level name; a local dict would be discarded on return
    # and the chat callback would never see the dataset.
    global system_message
    system_message = {
        "system": "You are a bot trained on the following dataset:",
        "dataset": dataset,
    }
    return "Chatbot trained successfully!"
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
def chat_with_bot(history, user_input):
    """Answer ``user_input`` using the stored dataset as context.

    Args:
        history: Existing chat history as a list of (user, bot) pairs;
            ``None`` is treated as an empty history.
        user_input: The user's new message.

    Returns:
        A new history list with one (user, bot) pair appended.
    """
    history = history or []

    if "dataset" not in system_message:
        return history + [(user_input, "No dataset loaded for the chatbot.")]

    bot_response = client.text_generation(
        prompt=f"{system_message['dataset']} {user_input}",
        max_new_tokens=128
    )
    # The client returns a plain string unless detailed output was requested;
    # only index into the response when it is not a string.
    if isinstance(bot_response, str):
        reply = bot_response
    else:
        reply = bot_response["generated_text"]
    return history + [(user_input, reply)]
|
|
|
|
|
|
|
145 |
|
146 |
+
# Gradio Interface
|
147 |
+
# Gradio Interface: three tabs (scrape, combine, train/chat) wired to the
# callbacks defined above.
with gr.Blocks() as app:
    gr.Markdown("# Intelligent Scraper, Dataset Handler, and Chatbot")

    with gr.Tab("Scrape / Extract Data"):
        gr.Markdown("Upload files or enter URLs to scrape data and generate JSON datasets.")

        instruction_input = gr.Textbox(label="Optional Instructions", placeholder="Enter instructions for scraping.")
        upload_files = gr.Files(label="Upload Files (PDF, TXT, ZIP)", file_types=[".pdf", ".txt", ".zip"])
        # Several lines so the "multiline" option in the label is usable.
        url_input = gr.Textbox(label="Enter URLs (comma-separated or multiline)", lines=3)
        scrape_button = gr.Button("Scrape / Extract Data")

        extracted_output = gr.Textbox(label="Extracted Output")
        dataset_button = gr.Button("Add to Dataset Queue")
        scraped_dataset = gr.Textbox(label="Current Dataset")

        scrape_button.click(scrape_data, inputs=[instruction_input, upload_files, url_input], outputs=extracted_output)
        dataset_button.click(add_to_queue, inputs=[extracted_output], outputs=scraped_dataset)

    with gr.Tab("Combine Datasets"):
        gr.Markdown("Combine queued datasets into a single JSON dataset.")

        combine_button = gr.Button("Combine Datasets")
        combined_output = gr.Textbox(label="Combined Dataset")
        download_button = gr.Button("Download Combined Dataset")
        download_output = gr.File(label="Download")

        combine_button.click(combine_datasets, outputs=[combined_output, download_output])
        # The download button regenerates the combined file and offers it.
        download_button.click(lambda: combine_datasets()[1], outputs=download_output)

    with gr.Tab("Train and Chat"):
        gr.Markdown("Train a chatbot with a selected dataset and interact with it.")

        chat_dataset = gr.Textbox(label="Dataset for Training", placeholder="Paste or load a dataset for training.")
        train_button = gr.Button("Train Chatbot")
        train_status = gr.Textbox(label="Training Status", interactive=False)
        chatbot = gr.Chatbot(label="Chat with Trained Bot")
        user_input = gr.Textbox(label="User Input")
        send_button = gr.Button("Send")

        # Shared chat context; the "dataset" key is absent until training.
        system_message = {"system": "You are a bot trained on the following dataset:"}

        train_button.click(train_chatbot, inputs=[chat_dataset], outputs=train_status)
        # Chat is driven from the textbox and the send button: the Chatbot
        # component itself exposes no click event.
        send_button.click(chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot)
        user_input.submit(chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot)

app.launch()
|
|
|
|