Spaces:

acecalisto3
/

GitBot

Runtime error

App Files Community

acecalisto3 committed on Dec 30, 2024

Commit

43cf74c

verified ·

1 Parent(s): 5cb59c2

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -393

app.py CHANGED Viewed

@@ -2,149 +2,73 @@ import os
 import gradio as gr
 import requests
 import uuid
-from huggingface_hub import InferenceClient, HfApi
 from pypdf import PdfReader
 from bs4 import BeautifulSoup
-import datetime
 import zipfile
-import nltk.data
 import nltk
-import langchain
-import dotenv
-import yaml
-from typing import Optional, Union, List, Dict, Any, Tuple
-import subprocess
-from pathlib import Path
-import json
-import tempfile
-from datetime import datetime as dt, timezone
-import re
-import logging
-import shutil
-# -----------------------
-# ENV / Logging Setup
-# -----------------------
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-logger = logging.getLogger(__name__)
-# Ensure the 'punkt' tokenizer is downloaded only if missing
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
-VERBOSE = True
-def log(message):
-    if VERBOSE:
-        print(f"[LOG] {datetime.datetime.now()} - {message}")
-# -----------------------
-# 1) Scraper/Indexer/Dataset Generator - from your first script
-# -----------------------
-# == Hugging Face API Setup ==
 HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-HF_TOKEN = os.environ.get('HF_TOKEN')
-if not HF_TOKEN:
-    raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
-try:
-    client = InferenceClient(HF_MODEL)
-    api = HfApi(token=HF_TOKEN)
-    log("Initialized Hugging Face client and API.")
-except Exception as e:
-    log(f"Error initializing Hugging Face client: {e}")
-    exit(1)
-REPO_NAME = "acecalisto3/tmp"
-DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
-MAX_TOKENS = 8192
-def read_pdf(file_path):
-    """Read PDF and return its text."""
     try:
         reader = PdfReader(file_path)
-        text = "\n".join(page.extract_text() for page in reader.pages)
-        return text
-    except Exception as e:
-        log(f"Error reading PDF {file_path}: {e}")
-        return ""
-def fetch_url(url, max_depth):
-    """Breadth-first search crawl to a given depth, collecting text."""
-    visited = set()
-    to_visit = [(url, 0)]
-    results = []
-    while to_visit:
-        current_url, depth = to_visit.pop(0)
-        if current_url in visited:
-            continue
-        if depth < max_depth:
-            try:
-                response = requests.get(current_url, timeout=10)
-                response.raise_for_status()
-                visited.add(current_url)
-                soup = BeautifulSoup(response.content, 'lxml')
-                results.append(soup.get_text())
-                for link in soup.find_all("a", href=True):
-                    absolute_url = requests.compat.urljoin(current_url, link.get('href'))
-                    if absolute_url.startswith("http") and absolute_url not in visited:
-                        to_visit.append((absolute_url, depth + 1))
-            except Exception as e:
-                log(f"Error fetching {current_url}: {e}")
-    return "\n".join(results)
-def read_txt(txt_path):
-    """Read text file."""
-    try:
-        with open(txt_path, "r", encoding="utf-8") as f:
-            return f.read()
     except Exception as e:
-        log(f"Error reading TXT file {txt_path}: {e}")
-        return ""
-def read_zip(zip_path):
-    """Read all .txt/.pdf files inside a ZIP."""
     try:
-        extracted_data = []
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            for file_info in zip_ref.infolist():
-                if file_info.filename.endswith((".txt", ".pdf")):
-                    with zip_ref.open(file_info) as file:
-                        content = file.read()
-                        if file_info.filename.endswith(".txt"):
-                            extracted_data.append(content.decode("utf-8"))
-                        elif file_info.filename.endswith(".pdf"):
-                            temp_path = f"/tmp/{uuid.uuid4()}"
-                            with open(temp_path, "wb") as temp_file:
-                                temp_file.write(content)
-                            extracted_data.append(read_pdf(temp_path))
-                            os.remove(temp_path)
-        return "\n".join(extracted_data)
     except Exception as e:
-        log(f"Error reading ZIP file {zip_path}: {e}")
-        return ""
-def process_file(file):
-    """Depending on file extension, process file to extract text."""
     try:
         if file.name.endswith(".pdf"):
-            return read_pdf(file.name)
         elif file.name.endswith(".txt"):
-            return read_txt(file.name)
         elif file.name.endswith(".zip"):
-            return read_zip(file.name)
     except Exception as e:
-        log(f"Error processing file {file.name}: {e}")
-        return ""
-def chunk_text(text, max_chunk_size):
-    """Naive chunking based on sentence tokenizer to avoid huge tokens."""
-    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
-    sentences = tokenizer.tokenize(text)
-    chunks = []
-    current_chunk = ""
     for sentence in sentences:
         if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
             chunks.append(current_chunk.strip())
@@ -154,311 +78,110 @@ def chunk_text(text, max_chunk_size):
         chunks.append(current_chunk.strip())
     return chunks
-def extract_dataset(data, instructions="Extract {history}", max_tokens=MAX_TOKENS):
-    """Call text generation on each chunk with a certain instruction."""
     extracted = []
-    chunks = chunk_text(data, 20000)  # Adjust chunk size as needed
     for i, chunk in enumerate(chunks):
         try:
             response = client.text_generation(
                 prompt=instructions.format(history=chunk),
-                max_new_tokens=max_tokens
             )
             extracted.append(response["generated_text"])
         except Exception as e:
-            log(f"Error processing chunk {i+1}: {e}")
-            extracted.append(f"Error processing chunk {i+1}: {e}")
     return "\n".join(extracted)
-def combine_datasets(datasets):
-    """Simply combine multiple dataset strings into one big string."""
-    return "\n".join(datasets)
-# -----------------------
-# 2) GitHub Issue Resolver - from your second script
-# -----------------------
-class TerminalCommand:
-    @staticmethod
-    def execute(command: Union[str, List[str]], cwd: Optional[str] = None) -> Tuple[str, str, int]:
-        """
-        Execute a terminal command and return stdout, stderr, and return code
-        """
-        if isinstance(command, str):
-            command = command.split()
-        try:
-            process = subprocess.Popen(
-                command,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                cwd=cwd,
-                text=True
-            )
-            stdout, stderr = process.communicate()
-            return stdout.strip(), stderr.strip(), process.returncode
-        except Exception as e:
-            logger.error(f"Error executing command {command}: {e}")
-            return "", str(e), 1
-class GitUtilities:
-    def __init__(self, repo_path: str):
-        self.repo_path = Path(repo_path)
-    def clone(self, url: str, branch: str = "main") -> bool:
-        """Clone a repository."""
-        stdout, stderr, code = TerminalCommand.execute(
-            f"git clone -b {branch} {url} {self.repo_path}"
-        )
-        if code != 0:
-            logger.error(f"Git clone failed: {stderr}")
-        return code == 0
-    def commit(self, message: str) -> bool:
-        """Create a commit with the given message."""
-        stdout, stderr, code = TerminalCommand.execute(
-            ["git", "commit", "-am", message],
-            str(self.repo_path)
-        )
-        if code != 0:
-            logger.error(f"Git commit failed: {stderr}")
-        return code == 0
-    def push(self, remote: str = "origin", branch: str = "main") -> bool:
-        """Push changes to remote."""
-        stdout, stderr, code = TerminalCommand.execute(
-            ["git", "push", remote, branch],
-            str(self.repo_path)
-        )
-        if code != 0:
-            logger.error(f"Git push failed: {stderr}")
-        return code == 0
-    def create_branch(self, branch_name: str) -> bool:
-        """Create and checkout a new branch."""
-        stdout, stderr, code = TerminalCommand.execute(
-            ["git", "checkout", "-b", branch_name],
-            str(self.repo_path)
-        )
-        if code != 0:
-            logger.error(f"Git branch creation failed: {stderr}")
-        return code == 0
-class GitHubBot:
-    def __init__(self, logger: logging.Logger):
-        self.github_api = None
-        self.logger = logger
-        self.ai_provider = None
-        self.git = None
-        self.temp_dir = None
-        self.base_url = "https://api.github.com"
-    def initialize_api(self, token: str):
-        """Initialize the GitHub API with a token."""
-        if not token:
-            raise ValueError("GitHub token is required.")
-        self.github_api = {"Authorization": f"Bearer {token}"}
-        self.temp_dir = tempfile.mkdtemp()
-        self.git = GitUtilities(self.temp_dir)
-    def create_pull_request(self, owner: str, repo: str, title: str, body: str, head: str, base: str = "main") -> Dict:
-        """Create a pull request."""
-        url = f"{self.base_url}/repos/{owner}/{repo}/pulls"
-        data = {
-            "title": title,
-            "body": body,
-            "head": head,
-            "base": base
-        }
-        try:
-            response = requests.post(url, headers=self.github_api, json=data)
-            response.raise_for_status()
-            return response.json()
-        except requests.RequestException as e:
-            logger.error(f"Error creating pull request: {e}")
-            raise
-    def resolve_issue(
-        self,
-        token: str,
-        owner: str,
-        repo: str,
-        issue_number: int,
-        resolution: str,
-        forked_repo: str
-    ) -> str:
-        """Resolve a GitHub issue by cloning, creating a fix branch, and opening a PR."""
-        try:
-            self.initialize_api(token)
-            branch_name = f"fix/issue-{issue_number}-{dt.now().strftime('%Y%m%d-%H%M%S')}"
-            # Clone repository (forked repo URL is expected)
-            if not self.git.clone(forked_repo):
-                raise Exception("Failed to clone repository")
-            # Create a new branch
-            if not self.git.create_branch(branch_name):
-                raise Exception("Failed to create branch")
-            # Generate resolution content
-            resolution_content = self._create_resolution_document(issue_number, resolution)
-            # Save resolution file (as an example, you can adjust)
-            resolution_path = Path(self.temp_dir) / f"resolution_{issue_number}.md"
-            with open(resolution_path, "w") as f:
-                f.write(resolution_content)
-            # Commit and push changes
-            if not self.git.commit(f"Fix for issue #{issue_number}"):
-                raise Exception("Failed to commit changes")
-            if not self.git.push("origin", branch_name):
-                raise Exception("Failed to push changes")
-            # Create a pull request
-            pr = self.create_pull_request(
-                owner=owner,
-                repo=repo,
-                title=f"Fix for issue #{issue_number}",
-                body="This PR resolves the reported issue with the following resolution.",
-                head=branch_name
-            )
-            return f"Pull request created: {pr['html_url']}"
-        except Exception as e:
-            logger.error(f"Error resolving issue #{issue_number}: {e}")
-            return f"Error: {e}"
-        finally:
-            if self.temp_dir and os.path.exists(self.temp_dir):
-                shutil.rmtree(self.temp_dir)
-    def _create_resolution_document(self, issue_number: int, resolution: str) -> str:
-        """Create a resolution document for the fix."""
-        return f"""# Resolution for Issue #{issue_number}
-## Resolution Details
-{resolution}
-## Metadata
-- Date: {dt.now(timezone.utc).isoformat()}
-- Resolved By: Automated System
-"""
-# -----------------------
-# 3) Build the combined Gradio interface with two tabs
-# -----------------------
-def create_combined_gradio_app():
-    """
-    Create one Gradio interface that has two tabs:
-    1) 'Scraper/Indexer/Dataset Generator'
-    2) 'GitHub Issue Resolver'
-    """
-    bot = GitHubBot(logger)
-    # 3.1) Functions for the first tab (Scraper/Indexer/Dataset Generator)
-    def process_workflow(command, data, files, url, depth):
-        datasets = []
-        errors = []
-        try:
-            # If user enters text in the data_input box
-            if data:
-                datasets.append(data)
-            # If user uploads any files
-            if files:
-                for file in files:
-                    datasets.append(process_file(file))
-            # If user supplies a URL
-            if url:
-                datasets.append(fetch_url(url, max_depth=depth))
-            # Depending on the command chosen, do the logic
-            if command == "Extract Dataset":
-                return {"datasets": extract_dataset("\n".join(datasets))}, ""
-            elif command == "Combine Datasets":
-                return {"datasets": combine_datasets(datasets)}, ""
-            # Default: if "Scrape Data" or "Train Chatbot" or unknown
-            return {"datasets": datasets}, ""
-        except Exception as e:
-            errors.append(str(e))
-            return {"datasets": []}, "\n".join(errors)
-    # 3.2) Functions for the second tab (GitHub Issue Resolver)
-    def on_resolve(token, repo_url, issue_number, resolution, forked_repo):
-        """
-        This callback is used when a user clicks 'Resolve Issue' in the second tab.
-        """
-        try:
-            parts = repo_url.strip("/").split("/")
-            # Typically, the repo URL is something like https://github.com/owner/repo
-            owner, repo = parts[-2], parts[-1]
-            result = bot.resolve_issue(token, owner, repo, int(issue_number), resolution, forked_repo)
-            return result
-        except Exception as e:
-            logger.error(f"Error in issue resolution: {e}")
-            return f"Error: {e}"
-    with gr.Blocks() as main_app:
-        # Title / Header
-        gr.Markdown("## Combined System: Scraper/Indexer/Dataset Generator & GitHub Issue Resolver")
-        with gr.Tab("Scraper / Indexer / Dataset Generator"):
-            gr.Markdown(
-                "**Use this tab to upload files, scrape data from URLs, or enter text to generate datasets.**"
-            )
-            # The UI from your first script
-            chatbot = gr.Chatbot(label="Flash Trained Chatbot (Placeholder)")
-            command_selector = gr.Dropdown(
-                label="Select Command",
-                choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
-                value="Scrape Data"
-            )
-            data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
-            file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
-            url_input = gr.Textbox(label="URL", placeholder="https://example.com")
-            depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
-            output_json = gr.JSON(label="Output Dataset")
-            error_output = gr.Textbox(label="Error Log", interactive=False)
-            process_button = gr.Button("Process")
-            process_button.click(
-                process_workflow,
-                inputs=[command_selector, data_input, file_upload, url_input, depth_slider],
-                outputs=[output_json, error_output]
-            )
-        with gr.Tab("GitHub Issue Resolver"):
-            gr.Markdown("**Use this tab to resolve GitHub issues by cloning, fixing, and opening PRs.**")
-            token_input = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token")
-            repo_url_input = gr.Textbox(label="Repository URL", placeholder="e.g. https://github.com/owner/repo")
-            issue_number_input = gr.Number(label="Issue Number", precision=0, value=1)
-            resolution_input = gr.Textbox(
-                label="Proposed Resolution",
-                placeholder="Describe the resolution for the issue here..."
-            )
-            forked_repo_input = gr.Textbox(
-                label="Forked Repo URL",
-                placeholder="e.g. https://github.com/youraccount/repo (your fork)"
-            )
-            resolve_button = gr.Button("Resolve Issue")
-            result_output = gr.Textbox(label="Result", interactive=False)
-            resolve_button.click(
-                fn=on_resolve,
-                inputs=[
-                    token_input,
-                    repo_url_input,
-                    issue_number_input,
-                    resolution_input,
-                    forked_repo_input
-                ],
-                outputs=[result_output]
-            )
-    return main_app
-if __name__ == "__main__":
-    app = create_combined_gradio_app()
-    app.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import requests
 import uuid
+import json
+from huggingface_hub import InferenceClient
 from pypdf import PdfReader
 from bs4 import BeautifulSoup
 import zipfile
 import nltk
+from typing import List, Dict
+# Ensure NLTK resources
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
+# Initialize Hugging Face API
 HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+HF_TOKEN = os.environ.get("HF_TOKEN")
+client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
+# State to manage datasets
+datasets_queue = []
+# Helper Functions
+def extract_text_from_pdf(file_path):
     try:
         reader = PdfReader(file_path)
+        return "\n".join(page.extract_text() for page in reader.pages)
     except Exception as e:
+        return f"Error reading PDF: {e}"
+def extract_text_from_url(url):
     try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "lxml")
+        return soup.get_text()
     except Exception as e:
+        return f"Error scraping URL: {e}"
+def process_uploaded_file(file):
     try:
         if file.name.endswith(".pdf"):
+            return extract_text_from_pdf(file.name)
         elif file.name.endswith(".txt"):
+            with open(file.name, "r", encoding="utf-8") as f:
+                return f.read()
         elif file.name.endswith(".zip"):
+            extracted_data = []
+            with zipfile.ZipFile(file.name, "r") as zip_ref:
+                for file_info in zip_ref.infolist():
+                    if file_info.filename.endswith((".pdf", ".txt")):
+                        with zip_ref.open(file_info) as f:
+                            content = f.read()
+                            if file_info.filename.endswith(".txt"):
+                                extracted_data.append(content.decode("utf-8"))
+                            elif file_info.filename.endswith(".pdf"):
+                                temp_path = f"/tmp/{uuid.uuid4()}"
+                                with open(temp_path, "wb") as temp_file:
+                                    temp_file.write(content)
+                                extracted_data.append(extract_text_from_pdf(temp_path))
+            return "\n".join(extracted_data)
     except Exception as e:
+        return f"Error processing file: {e}"
+def chunk_text(text, max_chunk_size=2000):
+    sentences = nltk.sent_tokenize(text)
+    chunks, current_chunk = [], ""
     for sentence in sentences:
         if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
             chunks.append(current_chunk.strip())
         chunks.append(current_chunk.strip())
     return chunks
+def infer_dataset(data, instructions):
     extracted = []
+    chunks = chunk_text(data)
     for i, chunk in enumerate(chunks):
         try:
             response = client.text_generation(
                 prompt=instructions.format(history=chunk),
+                max_new_tokens=1024
             )
             extracted.append(response["generated_text"])
         except Exception as e:
+            extracted.append(f"Error in chunk {i}: {e}")
     return "\n".join(extracted)
+# Gradio Interface
+def scrape_data(instructions, files, urls):
+    combined_data = []
+    # Process uploaded files
+    if files:
+        for file in files:
+            combined_data.append(process_uploaded_file(file))
+    # Process URLs
+    if urls:
+        url_list = [url.strip() for url in urls.split(",") if url.strip()]
+        for url in url_list:
+            combined_data.append(extract_text_from_url(url))
+    # Combine and infer with instructions
+    full_text = "\n".join(combined_data)
+    if instructions:
+        dataset = infer_dataset(full_text, instructions)
+    else:
+        dataset = full_text
+    return dataset
+def add_to_queue(dataset):
+    datasets_queue.append(dataset)
+    return json.dumps(datasets_queue, indent=2)
+def combine_datasets():
+    combined_data = "\n".join(datasets_queue)
+    combined_json = {"combined_dataset": combined_data}
+    combined_file = "/tmp/combined_dataset.json"
+    with open(combined_file, "w") as f:
+        json.dump(combined_json, f, indent=2)
+    return json.dumps(combined_json, indent=2), combined_file
+def train_chatbot(dataset):
+    system_message = {"system": "You are a bot trained on the following dataset:"}
+    system_message["dataset"] = dataset
+    return "Chatbot trained successfully!"
+def chat_with_bot(history, user_input):
+    if "dataset" not in system_message:
+        return history + [(user_input, "No dataset loaded for the chatbot.")]
+    bot_response = client.text_generation(
+        prompt=f"{system_message['dataset']} {user_input}",
+        max_new_tokens=128
+    )
+    return history + [(user_input, bot_response["generated_text"])]
+# Gradio Interface
+with gr.Blocks() as app:
+    gr.Markdown("# Intelligent Scraper, Dataset Handler, and Chatbot")
+    with gr.Tab("Scrape / Extract Data"):
+        gr.Markdown("Upload files or enter URLs to scrape data and generate JSON datasets.")
+        instruction_input = gr.Textbox(label="Optional Instructions", placeholder="Enter instructions for scraping.")
+        upload_files = gr.Files(label="Upload Files (PDF, TXT, ZIP)", file_types=[".pdf", ".txt", ".zip"])
+        url_input = gr.Textbox(label="Enter URLs (comma-separated or multiline)")
+        scrape_button = gr.Button("Scrape / Extract Data")
+        extracted_output = gr.Textbox(label="Extracted Output")
+        dataset_button = gr.Button("Add to Dataset Queue")
+        scraped_dataset = gr.Textbox(label="Current Dataset")
+        scrape_button.click(scrape_data, inputs=[instruction_input, upload_files, url_input], outputs=extracted_output)
+        dataset_button.click(add_to_queue, inputs=[extracted_output], outputs=scraped_dataset)
+    with gr.Tab("Combine Datasets"):
+        gr.Markdown("Combine queued datasets into a single JSON dataset.")
+        combine_button = gr.Button("Combine Datasets")
+        combined_output = gr.Textbox(label="Combined Dataset")
+        download_button = gr.Button("Download Combined Dataset")
+        download_output = gr.File(label="Download")
+        combine_button.click(combine_datasets, outputs=[combined_output, download_output])
+    with gr.Tab("Train and Chat"):
+        gr.Markdown("Train a chatbot with a selected dataset and interact with it.")
+        chat_dataset = gr.Textbox(label="Dataset for Training", placeholder="Paste or load a dataset for training.")
+        train_button = gr.Button("Train Chatbot")
+        chatbot = gr.Chatbot(label="Chat with Trained Bot")
+        system_message = {"system": "You are a bot trained on the following dataset:"}
+        train_button.click(train_chatbot, inputs=[chat_dataset], outputs=None)
+        chatbot.click(chat_with_bot, inputs=[chatbot, gr.Textbox(label="User Input")], outputs=chatbot)
+app.launch()