Alina Lozovskaya committed on
Commit 3119795 · 1 Parent(s): aa33679

Add kill task

Files changed (1): app.py (+170 -89)
app.py CHANGED
@@ -1,57 +1,93 @@
 import os
-import time
+import sys
 import pathlib
-import threading
 import shutil
-import gradio as gr
-import yaml
+import threading
+import multiprocessing
 import io
-
+import yaml
+import gradio as gr
 from loguru import logger
 from yourbench.pipeline import run_pipeline
 
 UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
 UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)
-
 CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")
 
-yourbench_log_stream = io.StringIO()
-
-def custom_log_handler(message):
-    yourbench_log_stream.write(message + "\n")
-    # yourbench_log_stream.flush()
-
-def get_log_content():
-    yourbench_log_stream.seek(0)
-    content = yourbench_log_stream.read()
-    print(len(content))
-    return content
-
-logger.add(custom_log_handler, filter="yourbench")
-
-def start_task():
-    # Start the long-running task in a separate thread
-    task_thread = threading.Thread(target=run_pipeline, args=(CONFIG_PATH,), daemon=True)
-    task_thread.start()
-    task_thread.join()
-
-def generate_config(
-    hf_token,
-    hf_org,
-    model_name,
-    provider,
-    base_url,
-    api_key,
-    max_concurrent_requests,
-    ingestion_source,
-    ingestion_output,
-    run_ingestion,
-    summarization_source,
-    summarization_output,
-    run_summarization
-):
-
-    """Generates a config.yaml based on user inputs"""
+logger.remove()
+logger.add(sys.stderr, level="INFO")
+
+import subprocess
+import time
+
+class SubprocessManager:
+    def __init__(self, command):
+        self.command = command
+        self.process = None
+        self.output_stream = io.StringIO()
+
+    def start_process(self):
+        """Start the subprocess."""
+        if self.is_running():
+            logger.info("Process is already running")
+            return
+
+        self.process = subprocess.Popen(
+            self.command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,  # Combine stderr with stdout
+            text=True,
+            bufsize=1,  # Line-buffered
+            start_new_session=True  # Start the process in a new session
+        )
+        os.set_blocking(self.process.stdout.fileno(), False)
+        logger.info("Started the process")
+
+    def read_and_get_output(self):
+        """Read available subprocess output and return the captured output."""
+        if self.process and self.process.stdout:
+            try:
+                while True:
+                    line = self.process.stdout.readline()
+                    if line:
+                        self.output_stream.write(line)  # Capture in StringIO
+                    else:
+                        break
+            except BlockingIOError:
+                pass
+        return self.output_stream.getvalue()
+
+    def stop_process(self):
+        """Terminate the subprocess."""
+        if not self.is_running():
+            logger.info("Process is not running")
+            return
+        logger.info("Sending SIGTERM to the process")
+        self.process.terminate()
+        exit_code = self.process.wait()  # Wait for the process to terminate
+        logger.info(f"Process stopped, exit code {exit_code}")
+        # return exit_code
+
+    def kill_process(self):
+        """Forcefully kill the subprocess."""
+        if not self.is_running():
+            logger.info("Process is not running")
+            return
+        logger.info("Sending SIGKILL to the process")
+        self.process.kill()
+        exit_code = self.process.wait()  # Wait for the process to be killed
+        logger.info(f"Process killed, exit code {exit_code}")
+        # return exit_code
+
+    def is_running(self):
+        """Check if the subprocess is still running."""
+        return self.process and self.process.poll() is None
+
+
+command = ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"]
+manager = SubprocessManager(command)
+
+def generate_config(hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests):
     config = {
         "hf_configuration": {
             "token": hf_token,
@@ -65,16 +101,84 @@ def generate_config(
             "api_key": api_key,
             "max_concurrent_requests": max_concurrent_requests
         }],
+        "model_roles": {role: [model_name] for role in [
+            "ingestion", "summarization", "single_shot_question_generation",
+            "multi_hop_question_generation", "answer_generation", "judge_answers"
+        ]},
+        "inference_config": {"max_concurrent_requests": 16},
         "pipeline": {
             "ingestion": {
-                "source_documents_dir": ingestion_source,
-                "output_dir": ingestion_output,
-                "run": run_ingestion
+                "source_documents_dir": "/app/uploaded_files",
+                "output_dir": "/app/ingested",
+                "run": True
+            },
+            "upload_ingest_to_hub": {
+                "source_documents_dir": "/app/ingested",
+                "hub_dataset_name": "test_ingested_documents",
+                "local_dataset_path": "/app/ingested_dataset",
+                "run": True
             },
             "summarization": {
-                "source_dataset_name": summarization_source,
-                "output_dataset_name": summarization_output,
-                "run": run_summarization
+                "source_dataset_name": "test_ingested_documents",
+                "output_dataset_name": "test_summaries",
+                "local_dataset_path": "/results/test_summaries",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "chunking": {
+                "source_dataset_name": "test_summaries",
+                "output_dataset_name": "test_chunked_documents",
+                "local_dataset_path": "/results/test_chunked_documents",
+                "concat_existing_dataset": False,
+                "chunking_configuration": {
+                    "l_min_tokens": 64,
+                    "l_max_tokens": 128,
+                    "tau_threshold": 0.3,
+                    "h_min": 2,
+                    "h_max": 4
+                },
+                "run": True
+            },
+            "single_shot_question_generation": {
+                "source_dataset_name": "test_chunked_documents",
+                "output_dataset_name": "test_single_shot_questions",
+                "local_dataset_path": "/results/test_single_shot_questions",
+                "diversification_seed": "24 year old adult",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "multi_hop_question_generation": {
+                "source_dataset_name": "test_chunked_documents",
+                "output_dataset_name": "test_multi_hop_questions",
+                "local_dataset_path": "/results/test_multi_hop_questions",
+                "concat_existing_dataset": False,
+                "run": True
+            },
+            "answer_generation": {
+                "run": True,
+                "question_dataset_name": "test_single_shot_questions",
+                "output_dataset_name": "test_answered_questions",
+                "local_dataset_path": "/results/test_answered_questions",
+                "concat_existing_dataset": False,
+                "strategies": [{
+                    "name": "zeroshot",
+                    "prompt": "ZEROSHOT_QA_USER_PROMPT",
+                    "model_name": model_name
+                }, {
+                    "name": "gold",
+                    "prompt": "GOLD_QA_USER_PROMPT",
+                    "model_name": model_name
+                }]
+            },
+            "judge_answers": {
+                "run": True,
+                "source_judge_dataset_name": "test_answered_questions",
+                "output_judged_dataset_name": "test_judged_comparisons",
+                "local_dataset_path": "/results/test_judged_comparisons",
+                "concat_existing_dataset": False,
+                "comparing_strategies": [["zeroshot", "gold"]],
+                "chunk_column_index": 0,
+                "random_seed": 42
             }
         }
     }
@@ -83,70 +187,45 @@ def generate_config(
 def save_config(yaml_text):
     with open(CONFIG_PATH, "w") as file:
         file.write(yaml_text)
-    return "✅ Config saved as config.yaml!"
-
+    return "✅ Config saved!"
 
 def save_files(files: list[str]):
-    saved_paths = []
-    for file in files:
-        file_path = pathlib.Path(file)
-        save_path = UPLOAD_DIRECTORY / file_path.name
-        shutil.move(str(file_path), str(save_path))
-        saved_paths.append(str(save_path))
-    return f"Files have been successfully saved to: {', '.join(saved_paths)}"
-
-def start_youbench():
-    run_pipeline(CONFIG_PATH, debug=False)
+    saved_paths = [shutil.move(str(pathlib.Path(file)), str(UPLOAD_DIRECTORY / pathlib.Path(file).name)) for file in files]
+    return f"Files saved to: {', '.join(saved_paths)}"
 
 app = gr.Blocks()
 
 with app:
     gr.Markdown("## YourBench Configuration")
 
-    with gr.Tab("HF Configuration"):
+    with gr.Tab("Configuration"):
         hf_token = gr.Textbox(label="HF Token")
         hf_org = gr.Textbox(label="HF Organization")
-
-    with gr.Tab("Model Settings"):
         model_name = gr.Textbox(label="Model Name")
        provider = gr.Dropdown(["openrouter", "openai", "huggingface"], value="huggingface", label="Provider")
         base_url = gr.Textbox(label="Base URL")
         api_key = gr.Textbox(label="API Key")
         max_concurrent_requests = gr.Dropdown([8, 16, 32], value=16, label="Max Concurrent Requests")
-
-    with gr.Tab("Pipeline Stages"):
-        ingestion_source = gr.Textbox(label="Ingestion Source Directory")
-        ingestion_output = gr.Textbox(label="Ingestion Output Directory")
-        run_ingestion = gr.Checkbox(label="Run Ingestion", value=False)
-        summarization_source = gr.Textbox(label="Summarization Source Dataset")
-        summarization_output = gr.Textbox(label="Summarization Output Dataset")
-        run_summarization = gr.Checkbox(label="Run Summarization", value=False)
-
-    with gr.Tab("Config"):
         config_output = gr.Code(label="Generated Config", language="yaml")
         preview_button = gr.Button("Generate Config")
         save_button = gr.Button("Save Config")
-
-        preview_button.click(generate_config,
-            inputs=[hf_token, hf_org, model_name, provider, base_url, api_key,
-                max_concurrent_requests, ingestion_source, ingestion_output,
-                run_ingestion, summarization_source, summarization_output, run_summarization],
-            outputs=config_output)
-
+
+        preview_button.click(generate_config, inputs=[hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests], outputs=config_output)
         save_button.click(save_config, inputs=[config_output], outputs=[gr.Textbox(label="Save Status")])
 
     with gr.Tab("Files"):
         file_input = gr.File(label="Upload text files", file_count="multiple", file_types=[".txt", ".md", ".html"])
-        file_explorer = gr.FileExplorer(root_dir=UPLOAD_DIRECTORY, interactive=False, label="Current Files")
         output = gr.Textbox(label="Log")
         file_input.upload(save_files, file_input, output)
 
-
    with gr.Tab("Run Generation"):
-        log_output = gr.Code(label="Log Output", language=None,lines=20, interactive=False)
-        start_button = gr.Button("Start Long-Running Task")
-        timer = gr.Timer(0.5, active=True)
-        timer.tick(get_log_content, outputs=log_output)
-        start_button.click(start_task)
+        log_output = gr.Code(label="Log Output", language=None, lines=20, interactive=False)
+        start_button = gr.Button("Start Task")
+        start_button.click(manager.start_process)
+        timer = gr.Timer(0.1, active=True)
+        timer.tick(manager.read_and_get_output, outputs=log_output)
+
+        kill_button = gr.Button("Kill Task")
+        kill_button.click(manager.kill_process)
 
-app.launch()
+app.launch()
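
A minimal, self-contained sketch of the pattern this commit introduces, for trying it outside the Space: spawn a child process in its own session, poll its merged stdout/stderr without blocking, then stop it with SIGTERM and fall back to SIGKILL. The bash demo command, the tick count, and the timeout are illustrative assumptions, not part of the app; the real command is ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"], the "Kill Task" button goes straight to kill_process() (SIGKILL), and os.set_blocking on a pipe is POSIX-only, which matches the Linux container the Space runs in.

# Hypothetical demo; the app runs ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"].
import os
import subprocess
import time

proc = subprocess.Popen(
    ["bash", "-c", "while true; do echo tick; sleep 1; done"],  # stand-in workload
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,    # merge stderr into stdout, as in start_process()
    text=True,
    bufsize=1,                   # line-buffered
    start_new_session=True,      # child gets its own session, as in start_process()
)
os.set_blocking(proc.stdout.fileno(), False)  # readline() now returns instead of waiting

captured = []
for _ in range(20):              # stands in for the gr.Timer(0.1) polling ticks
    try:
        while (line := proc.stdout.readline()):
            captured.append(line)    # accumulate output, like the StringIO buffer
    except BlockingIOError:
        pass                         # no output available on this tick
    time.sleep(0.1)

proc.terminate()                 # SIGTERM, the stop_process() path
try:
    proc.wait(timeout=2)
except subprocess.TimeoutExpired:
    proc.kill()                  # SIGKILL, the kill_process() path
    proc.wait()

print("".join(captured), end="")
print(f"exit code: {proc.returncode}")

One design note: start_new_session=True detaches the child from the app's session, so signals aimed at the Gradio process (for example a Ctrl-C to its group) do not implicitly reach the pipeline; the manager has to signal the child explicitly, which is exactly what the stop/kill methods do.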