Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Alina Lozovskaya
committed on
Commit
·
d1ed69b
1
Parent(s):
4111351
First commit
Browse files- .gitignore +45 -0
- Dockerfile +30 -0
- app.py +152 -0
- pyproject.toml +26 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
logs/
|
6 |
+
log_backups/
|
7 |
+
plots/
|
8 |
+
files/
|
9 |
+
.gradio/
|
10 |
+
fr.sh
|
11 |
+
repo_contents.txt
|
12 |
+
# C extensions
|
13 |
+
*.so
|
14 |
+
|
15 |
+
# Distribution / packaging
|
16 |
+
.Python
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
wheels/
|
29 |
+
share/python-wheels/
|
30 |
+
*.egg-info/
|
31 |
+
.installed.cfg
|
32 |
+
*.egg
|
33 |
+
MANIFEST
|
34 |
+
|
35 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
36 |
+
__pypackages__/
|
37 |
+
|
38 |
+
# Environments
|
39 |
+
.env
|
40 |
+
.venv
|
41 |
+
env/
|
42 |
+
venv/
|
43 |
+
ENV/
|
44 |
+
env.bak/
|
45 |
+
venv.bak/
|
Dockerfile
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use Python 3.12.1 slim image as base
FROM python:3.12.1-slim

# Install dependencies required for UV and Python packages
# (curl to fetch the uv installer, git for the VCS dependency in pyproject.toml)
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git && \
    rm -rf /var/lib/apt/lists/*

# Install UV (fast Python dependency manager)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Ensure UV is available in PATH
ENV PATH="/root/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy pyproject and install dependencies using UV
# NOTE(review): uv.lock is not copied before `uv sync`, so versions are
# re-resolved at build time — consider `COPY uv.lock .` + `uv sync --frozen`
# for reproducible builds; confirm against the project's release process.
COPY pyproject.toml .
RUN uv venv && uv sync

# Copy application code
COPY app.py .

# Expose Gradio app port
EXPOSE 7860
# Bind Gradio to all interfaces so the Space/container can reach it.
ENV GRADIO_SERVER_NAME="0.0.0.0"

# Entrypoint to run the Gradio app
ENTRYPOINT ["uv", "run", "python", "app.py"]
|
app.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import pathlib
|
4 |
+
import threading
|
5 |
+
import shutil
|
6 |
+
import gradio as gr
|
7 |
+
import yaml
|
8 |
+
import io
|
9 |
+
|
10 |
+
from loguru import logger
|
11 |
+
from yourbench.pipeline import run_pipeline
|
12 |
+
|
13 |
+
# Directory where user-uploaded documents are stored; created eagerly so the
# FileExplorer widget has a valid root even before the first upload.
UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Path the generated YAML pipeline config is written to and later read from
# when the pipeline is launched.
CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")

# In-memory buffer accumulating yourbench log output for display in the UI.
yourbench_log_stream = io.StringIO()
|
19 |
+
|
20 |
+
def custom_log_handler(message):
    """Loguru sink: append one log record (newline-terminated) to the stream."""
    line = f"{message}\n"
    yourbench_log_stream.write(line)
|
23 |
+
|
24 |
+
def get_log_content():
    """Return everything accumulated in the yourbench log stream.

    Runs on a 0.5 s Gradio timer to refresh the log viewer, so it must be
    cheap and quiet. (Removed a leftover debug ``print(len(content))`` that
    wrote to stdout twice per second.)
    """
    # Rewind before reading: the log sink leaves the cursor at the end.
    yourbench_log_stream.seek(0)
    return yourbench_log_stream.read()
|
29 |
+
|
30 |
+
# Route yourbench-namespaced log records into the in-memory stream so the
# UI timer can display them.
logger.add(custom_log_handler, filter="yourbench")
|
31 |
+
|
32 |
+
def start_task():
    """Launch the yourbench pipeline in a background daemon thread.

    Handler for the "Start" button. Returning immediately keeps the Gradio
    event loop responsive while the 0.5 s timer streams pipeline logs.
    """
    task_thread = threading.Thread(
        target=run_pipeline,
        args=(CONFIG_PATH,),
        daemon=True,
    )
    task_thread.start()
    # NOTE: no join() here — joining right after start() would block this
    # handler for the entire pipeline run, defeating the background thread
    # (and freezing the UI until completion).
|
37 |
+
|
38 |
+
def generate_config(
    hf_token,
    hf_org,
    model_name,
    provider,
    base_url,
    api_key,
    max_concurrent_requests,
    ingestion_source,
    ingestion_output,
    run_ingestion,
    summarization_source,
    summarization_output,
    run_summarization,
):
    """Build a YourBench YAML config string from the UI form values.

    Returns the YAML text (block style) so it can be previewed in the UI
    before being saved to disk.
    """
    hf_section = {
        "token": hf_token,
        "private": True,
        "hf_organization": hf_org,
    }

    model_entry = {
        "model_name": model_name,
        "provider": provider,
        "base_url": base_url,
        "api_key": api_key,
        "max_concurrent_requests": max_concurrent_requests,
    }

    pipeline_section = {
        "ingestion": {
            "source_documents_dir": ingestion_source,
            "output_dir": ingestion_output,
            "run": run_ingestion,
        },
        "summarization": {
            "source_dataset_name": summarization_source,
            "output_dataset_name": summarization_output,
            "run": run_summarization,
        },
    }

    config = {
        "hf_configuration": hf_section,
        "model_list": [model_entry],
        "pipeline": pipeline_section,
    }
    return yaml.dump(config, default_flow_style=False)
|
82 |
+
|
83 |
+
def save_config(yaml_text):
    """Write the YAML text to CONFIG_PATH and return a status message."""
    CONFIG_PATH.write_text(yaml_text)
    return "✅ Config saved as config.yaml!"
|
87 |
+
|
88 |
+
|
89 |
+
def save_files(files: list[str]):
    """Move uploaded temp files into UPLOAD_DIRECTORY; return a summary line."""
    stored = []
    for src in files:
        source = pathlib.Path(src)
        destination = UPLOAD_DIRECTORY / source.name
        # Move (not copy) so Gradio's temporary upload files are cleaned up.
        shutil.move(str(source), str(destination))
        stored.append(str(destination))
    return f"Files have been successfully saved to: {', '.join(stored)}"
|
97 |
+
|
98 |
+
def start_youbench():
    # Synchronous entry point for the pipeline.
    # NOTE(review): not wired to any UI control in this file — the Start
    # button uses start_task instead; confirm whether this is dead code.
    run_pipeline(CONFIG_PATH, debug=False)
|
100 |
+
|
101 |
+
# --- Gradio UI: tabbed configuration and run interface ---------------------
app = gr.Blocks()

with app:
    gr.Markdown("## YourBench Configuration")

    # Hugging Face credentials used by the generated config.
    with gr.Tab("HF Configuration"):
        hf_token = gr.Textbox(label="HF Token")
        hf_org = gr.Textbox(label="HF Organization")

    # Model/provider settings for the config's model_list entry.
    with gr.Tab("Model Settings"):
        model_name = gr.Textbox(label="Model Name")
        provider = gr.Dropdown(["openrouter", "openai", "huggingface"], value="huggingface", label="Provider")
        base_url = gr.Textbox(label="Base URL")
        api_key = gr.Textbox(label="API Key")
        max_concurrent_requests = gr.Dropdown([8, 16, 32], value=16, label="Max Concurrent Requests")

    # Per-stage pipeline options (ingestion + summarization).
    with gr.Tab("Pipeline Stages"):
        ingestion_source = gr.Textbox(label="Ingestion Source Directory")
        ingestion_output = gr.Textbox(label="Ingestion Output Directory")
        run_ingestion = gr.Checkbox(label="Run Ingestion", value=False)
        summarization_source = gr.Textbox(label="Summarization Source Dataset")
        summarization_output = gr.Textbox(label="Summarization Output Dataset")
        run_summarization = gr.Checkbox(label="Run Summarization", value=False)

    # Preview the generated YAML, then persist it to CONFIG_PATH.
    with gr.Tab("Config"):
        config_output = gr.Code(label="Generated Config", language="yaml")
        preview_button = gr.Button("Generate Config")
        save_button = gr.Button("Save Config")

        # Inputs are passed positionally; this order must match
        # generate_config's parameter order exactly.
        preview_button.click(generate_config,
            inputs=[hf_token, hf_org, model_name, provider, base_url, api_key,
                    max_concurrent_requests, ingestion_source, ingestion_output,
                    run_ingestion, summarization_source, summarization_output, run_summarization],
            outputs=config_output)

        save_button.click(save_config, inputs=[config_output], outputs=[gr.Textbox(label="Save Status")])

    # Upload source documents and browse what has been stored so far.
    with gr.Tab("Files"):
        file_input = gr.File(label="Upload text files", file_count="multiple", file_types=[".txt", ".md", ".html"])
        file_explorer = gr.FileExplorer(root_dir=UPLOAD_DIRECTORY, interactive=False, label="Current Files")
        output = gr.Textbox(label="Log")
        file_input.upload(save_files, file_input, output)

    # Kick off the pipeline and stream its logs.
    with gr.Tab("Run Generation"):
        # NOTE(review): the timer polls get_log_content twice per second even
        # before a run starts — confirm this idle polling is acceptable.
        log_output = gr.Code(label="Log Output", language=None,lines=20, interactive=False)
        start_button = gr.Button("Start Long-Running Task")
        timer = gr.Timer(0.5, active=True)
        timer.tick(get_log_content, outputs=log_output)
        start_button.click(start_task)

app.launch()
|
pyproject.toml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
name = "yourbench-space"
version = "0.1.0"
# Pinned to the 3.12 series to match the Docker base image (python:3.12.1-slim).
requires-python = ">=3.12, <3.13"

dependencies = [
    # Benchmark pipeline engine, pinned to a tagged release via git.
    "yourbench @ git+https://github.com/huggingface/[email protected]",
    # NOTE(review): `asyncio` is part of the standard library; the PyPI
    # package of that name is an old backport — confirm it is really needed.
    "asyncio>=3.4.3",
    "datasets>=3.3.0",
    "gradio>=5.20.0",
    "hf-transfer>=0.1.9",
    "langfuse>=2.59.3",
    "litellm>=1.61.16",
    "loguru>=0.7.3",
    "markitdown>=0.0.1a4",
    "matplotlib>=3.10.0",
    "openai>=1.63.0",
    "python-dotenv>=1.0.1",
    "torch>=2.6.0",
    "tqdm>=4.67.1",
    "transformers>=4.48.3",
]

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
|
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|