Alina Lozovskaya committed on
Commit 3119795 · 1 Parent(s): aa33679

Add kill task

Files changed (1): app.py (+170 -89)
app.py CHANGED
@@ -1,57 +1,93 @@
 import os
-import time
+import sys
 import pathlib
-import threading
 import shutil
-import gradio as gr
-import yaml
+import threading
+import multiprocessing
 import io
-
+import yaml
+import gradio as gr
 from loguru import logger
 from yourbench.pipeline import run_pipeline
 
 UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
 UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)
-
 CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")
 
-yourbench_log_stream = io.StringIO()
-
-def custom_log_handler(message):
-    yourbench_log_stream.write(message + "\n")
-    # yourbench_log_stream.flush()
-
-def get_log_content():
-    yourbench_log_stream.seek(0)
-    content = yourbench_log_stream.read()
-    print(len(content))
-    return content
-
-logger.add(custom_log_handler, filter="yourbench")
-
-def start_task():
-    # Start the long-running task in a separate thread
-    task_thread = threading.Thread(target=run_pipeline, args=(CONFIG_PATH,), daemon=True)
-    task_thread.start()
-    task_thread.join()
-
-def generate_config(
-    hf_token,
-    hf_org,
-    model_name,
-    provider,
-    base_url,
-    api_key,
-    max_concurrent_requests,
-    ingestion_source,
-    ingestion_output,
-    run_ingestion,
-    summarization_source,
-    summarization_output,
-    run_summarization
-):
-
-    """Generates a config.yaml based on user inputs"""
+logger.remove()
+logger.add(sys.stderr, level="INFO")
+
+import subprocess
+import time
+
+class SubprocessManager:
+    def __init__(self, command):
+        self.command = command
+        self.process = None
+        self.output_stream = io.StringIO()
+
+    def start_process(self):
+        """Start the subprocess."""
+        if self.is_running():
+            logger.info("Process is already running")
+            return
+
+        self.process = subprocess.Popen(
+            self.command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,  # Combine stderr with stdout
+            text=True,
+            bufsize=1,  # Line-buffered
+            start_new_session=True  # Start the process in a new session
+        )
+        os.set_blocking(self.process.stdout.fileno(), False)
+        logger.info("Started the process")
+
+    def read_and_get_output(self):
+        """Read available subprocess output and return the captured output."""
+        if self.process and self.process.stdout:
+            try:
+                while True:
+                    line = self.process.stdout.readline()
+                    if line:
+                        self.output_stream.write(line)  # Capture in StringIO
+                    else:
+                        break
+            except BlockingIOError:
+                pass
+        return self.output_stream.getvalue()
+
+    def stop_process(self):
+        """Terminate the subprocess."""
+        if not self.is_running():
+            logger.info("Process is not running")
+            return
+        logger.info("Sending SIGTERM to the process")
+        self.process.terminate()
+        exit_code = self.process.wait()  # Wait for the process to terminate
+        logger.info(f"Process stopped, exit code {exit_code}")
+        # return exit_code
+
+    def kill_process(self):
+        """Forcefully kill the subprocess."""
+        if not self.is_running():
+            logger.info("Process is not running")
+            return
+        logger.info("Sending SIGKILL to the process")
+        self.process.kill()
+        exit_code = self.process.wait()  # Wait for the process to be killed
+        logger.info(f"Process killed, exit code {exit_code}")
+        # return exit_code
+
+    def is_running(self):
+        """Check if the subprocess is still running."""
+        return self.process and self.process.poll() is None
+
+
+command = ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"]
+manager = SubprocessManager(command)
+
+def generate_config(hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests):
     config = {
         "hf_configuration": {
             "token": hf_token,
@@ -65,16 +101,84 @@ def generate_config(
             "api_key": api_key,
             "max_concurrent_requests": max_concurrent_requests
         }],
+        "model_roles": {role: [model_name] for role in [
+            "ingestion", "summarization", "single_shot_question_generation",
+            "multi_hop_question_generation", "answer_generation", "judge_answers"
+        ]},
+        "inference_config": {"max_concurrent_requests": 16},
         "pipeline": {
             "ingestion": {
-                "source_documents_dir": ingestion_source,
-                "output_dir": ingestion_output,
-                "run": run_ingestion
+                "source_documents_dir": "/app/uploaded_files",
+                "output_dir": "/app/ingested",
+                "run": True
+            },
+            "upload_ingest_to_hub": {
+                "source_documents_dir": "/app/ingested",
+                "hub_dataset_name": "test_ingested_documents",
+                "local_dataset_path": "/app/ingested_dataset",
+                "run": True
             },
             "summarization": {
-                "source_dataset_name": summarization_source,
-                "output_dataset_name": summarization_output,
-                "run": run_summarization
+                "source_dataset_name": "test_ingested_documents",
+                "output_dataset_name": "test_summaries",
+                "local_dataset_path": "/results/test_summaries",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "chunking": {
+                "source_dataset_name": "test_summaries",
+                "output_dataset_name": "test_chunked_documents",
+                "local_dataset_path": "/results/test_chunked_documents",
+                "concat_existing_dataset": False,
+                "chunking_configuration": {
+                    "l_min_tokens": 64,
+                    "l_max_tokens": 128,
+                    "tau_threshold": 0.3,
+                    "h_min": 2,
+                    "h_max": 4
+                },
+                "run": True
+            },
+            "single_shot_question_generation": {
+                "source_dataset_name": "test_chunked_documents",
+                "output_dataset_name": "test_single_shot_questions",
+                "local_dataset_path": "/results/test_single_shot_questions",
+                "diversification_seed": "24 year old adult",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "multi_hop_question_generation": {
+                "source_dataset_name": "test_chunked_documents",
+                "output_dataset_name": "test_multi_hop_questions",
+                "local_dataset_path": "/results/test_multi_hop_questions",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "answer_generation": {
+                "run": True,
+                "question_dataset_name": "test_single_shot_questions",
+                "output_dataset_name": "test_answered_questions",
+                "local_dataset_path": "/results/test_answered_questions",
+                "concat_existing_dataset": False,
+                "strategies": [{
+                    "name": "zeroshot",
+                    "prompt": "ZEROSHOT_QA_USER_PROMPT",
+                    "model_name": model_name
+                }, {
+                    "name": "gold",
+                    "prompt": "GOLD_QA_USER_PROMPT",
+                    "model_name": model_name
+                }]
+            },
+            "judge_answers": {
+                "run": True,
+                "source_judge_dataset_name": "test_answered_questions",
+                "output_judged_dataset_name": "test_judged_comparisons",
+                "local_dataset_path": "/results/test_judged_comparisons",
+                "concat_existing_dataset": False,
+                "comparing_strategies": [["zeroshot", "gold"]],
+                "chunk_column_index": 0,
+                "random_seed": 42
             }
         }
     }
@@ -83,70 +187,45 @@ def generate_config(
 def save_config(yaml_text):
     with open(CONFIG_PATH, "w") as file:
         file.write(yaml_text)
-    return "✅ Config saved as config.yaml!"
-
+    return "✅ Config saved!"
 
 def save_files(files: list[str]):
-    saved_paths = []
-    for file in files:
-        file_path = pathlib.Path(file)
-        save_path = UPLOAD_DIRECTORY / file_path.name
-        shutil.move(str(file_path), str(save_path))
-        saved_paths.append(str(save_path))
-    return f"Files have been successfully saved to: {', '.join(saved_paths)}"
-
-def start_youbench():
-    run_pipeline(CONFIG_PATH, debug=False)
+    saved_paths = [shutil.move(str(pathlib.Path(file)), str(UPLOAD_DIRECTORY / pathlib.Path(file).name)) for file in files]
+    return f"Files saved to: {', '.join(saved_paths)}"
 
 app = gr.Blocks()
 
 with app:
     gr.Markdown("## YourBench Configuration")
 
-    with gr.Tab("HF Configuration"):
+    with gr.Tab("Configuration"):
         hf_token = gr.Textbox(label="HF Token")
         hf_org = gr.Textbox(label="HF Organization")
-
-    with gr.Tab("Model Settings"):
         model_name = gr.Textbox(label="Model Name")
        provider = gr.Dropdown(["openrouter", "openai", "huggingface"], value="huggingface", label="Provider")
         base_url = gr.Textbox(label="Base URL")
         api_key = gr.Textbox(label="API Key")
         max_concurrent_requests = gr.Dropdown([8, 16, 32], value=16, label="Max Concurrent Requests")
-
-    with gr.Tab("Pipeline Stages"):
-        ingestion_source = gr.Textbox(label="Ingestion Source Directory")
-        ingestion_output = gr.Textbox(label="Ingestion Output Directory")
-        run_ingestion = gr.Checkbox(label="Run Ingestion", value=False)
-        summarization_source = gr.Textbox(label="Summarization Source Dataset")
-        summarization_output = gr.Textbox(label="Summarization Output Dataset")
-        run_summarization = gr.Checkbox(label="Run Summarization", value=False)
-
-    with gr.Tab("Config"):
         config_output = gr.Code(label="Generated Config", language="yaml")
         preview_button = gr.Button("Generate Config")
         save_button = gr.Button("Save Config")
-
-        preview_button.click(generate_config,
-            inputs=[hf_token, hf_org, model_name, provider, base_url, api_key,
-                max_concurrent_requests, ingestion_source, ingestion_output,
-                run_ingestion, summarization_source, summarization_output, run_summarization],
-            outputs=config_output)
-
+
+        preview_button.click(generate_config, inputs=[hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests], outputs=config_output)
         save_button.click(save_config, inputs=[config_output], outputs=[gr.Textbox(label="Save Status")])
 
     with gr.Tab("Files"):
         file_input = gr.File(label="Upload text files", file_count="multiple", file_types=[".txt", ".md", ".html"])
-        file_explorer = gr.FileExplorer(root_dir=UPLOAD_DIRECTORY, interactive=False, label="Current Files")
         output = gr.Textbox(label="Log")
         file_input.upload(save_files, file_input, output)
 
-
    with gr.Tab("Run Generation"):
-        log_output = gr.Code(label="Log Output", language=None,lines=20, interactive=False)
-        start_button = gr.Button("Start Long-Running Task")
-        timer = gr.Timer(0.5, active=True)
-        timer.tick(get_log_content, outputs=log_output)
-        start_button.click(start_task)
+        log_output = gr.Code(label="Log Output", language=None, lines=20, interactive=False)
+        start_button = gr.Button("Start Task")
+        start_button.click(manager.start_process)
+        timer = gr.Timer(0.1, active=True)
+        timer.tick(manager.read_and_get_output, outputs=log_output)
+
+        kill_button = gr.Button("Kill Task")
+        kill_button.click(manager.kill_process)
 
-app.launch()
+app.launch()
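
A minimal, self-contained sketch of the pattern this commit introduces, for trying it outside the Space: spawn a child process in its own session, poll its merged stdout/stderr without blocking, then stop it with SIGTERM and fall back to SIGKILL. The bash demo command, the tick count, and the timeout are illustrative assumptions, not part of the app; the real command is ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"], the "Kill Task" button goes straight to kill_process() (SIGKILL), and os.set_blocking on a pipe is POSIX-only, which matches the Linux container the Space runs in.

# Hypothetical demo; the app runs ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"].
import os
import subprocess
import time

proc = subprocess.Popen(
    ["bash", "-c", "while true; do echo tick; sleep 1; done"],  # stand-in workload
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,    # merge stderr into stdout, as in start_process()
    text=True,
    bufsize=1,                   # line-buffered
    start_new_session=True,      # child gets its own session, as in start_process()
)
os.set_blocking(proc.stdout.fileno(), False)  # readline() now returns instead of waiting

captured = []
for _ in range(20):              # stands in for the gr.Timer(0.1) polling ticks
    try:
        while (line := proc.stdout.readline()):
            captured.append(line)    # accumulate output, like the StringIO buffer
    except BlockingIOError:
        pass                         # no output available on this tick
    time.sleep(0.1)

proc.terminate()                 # SIGTERM, the stop_process() path
try:
    proc.wait(timeout=2)
except subprocess.TimeoutExpired:
    proc.kill()                  # SIGKILL, the kill_process() path
    proc.wait()

print("".join(captured), end="")
print(f"exit code: {proc.returncode}")

One design note: start_new_session=True detaches the child from the app's session, so signals aimed at the Gradio process (for example a Ctrl-C to its group) do not implicitly reach the pipeline; the manager has to signal the child explicitly, which is exactly what the stop/kill methods do.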