import yaml
from yourbench_space.utils import CONFIG_PATH


def generate_base_config(
    hf_org,
    model_name,
    provider,
    base_url,
    model_api_key,
    max_concurrent_requests,
    hf_dataset_prefix,
    private_dataset,
    ingestion_model,
    summarization_model,
    single_shot_question_generation_model,
    multi_hop_question_generation_model,
    answer_generation_model,
    judge_answers_model,
):
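    """Render a YourBench pipeline configuration as a YAML string.

    The emitted YAML references the HF_TOKEN and MODEL_API_KEY
    environment variables rather than embedding secrets, so the raw
    model_api_key argument is never written into the config text.
    """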
    config = {
        "hf_configuration": {
            "token": "$HF_TOKEN",
            "private": private_dataset,
            "hf_organization": hf_org,
        },
        "model_list": [
            {
                "model_name": model_name,
                "provider": provider,
                "base_url": base_url,
                "api_key": "$MODEL_API_KEY",
                "max_concurrent_requests": max_concurrent_requests,
            }
        ],
        "model_roles": {
            role: [model_name]
            for role in [
                "ingestion",
                "summarization",
                "single_shot_question_generation",
                "multi_hop_question_generation",
                "answer_generation",
                "judge_answers",
            ]
        },
        "inference_config": {"max_concurrent_requests": 16},
        "pipeline": {
            "ingestion": {
                "source_documents_dir": "/app/uploaded_files",
                "output_dir": "/app/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": "/app/ingested",
                "hub_dataset_name": f"{hf_dataset_prefix}_ingested_documents",
                "run": True,
            },
            "summarization": {
                "source_dataset_name": f"{hf_dataset_prefix}_ingested_documents",
                "output_dataset_name": f"{hf_dataset_prefix}_summaries",
                "concat_existing_dataset": False,
                "run": True,
            },
            "chunking": {
                "source_dataset_name": f"{hf_dataset_prefix}_summaries",
                "output_dataset_name": f"{hf_dataset_prefix}_chunked_documents",
                "concat_existing_dataset": False,
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.3,
                    "h_min": 2,
                    "h_max": 4,
                },
                "run": True,
            },
            "single_shot_question_generation": {
                "source_dataset_name": f"{hf_dataset_prefix}_chunked_documents",
                "output_dataset_name": f"{hf_dataset_prefix}_single_shot_questions",
                "diversification_seed": "24 year old adult",
                "concat_existing_dataset": False,
                "run": True,
            },
            "multi_hop_question_generation": {
                "source_dataset_name": f"{hf_dataset_prefix}_chunked_documents",
                "output_dataset_name": f"{hf_dataset_prefix}_multi_hop_questions",
                "concat_existing_dataset": False,
                "run": True,
            },
            "answer_generation": {
                "question_dataset_name": f"{hf_dataset_prefix}_single_shot_questions",
                "output_dataset_name": f"{hf_dataset_prefix}_answered_questions",
                "concat_existing_dataset": False,
                "strategies": [
                    {
                        "name": "zeroshot",
                        "prompt": "ZEROSHOT_QA_USER_PROMPT",
                        "model_name": model_name,
                    },
                    {
                        "name": "gold",
                        "prompt": "GOLD_QA_USER_PROMPT",
                        "model_name": model_name,
                    },
                ],
                "run": True,
            },
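            # Judges the answer pairs produced by the strategy pairings in
            # comparing_strategies (here, zeroshot vs. gold).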
            "judge_answers": {
                "source_judge_dataset_name": f"{hf_dataset_prefix}_answered_questions",
                "output_judged_dataset_name": f"{hf_dataset_prefix}_judged_comparisons",
                "concat_existing_dataset": False,
                "comparing_strategies": [["zeroshot", "gold"]],
                "chunk_column_index": 0,
                "random_seed": 42,
                "run": True,
            },
        },
    }
    return yaml.dump(config, sort_keys=False)


def save_config(yaml_text):
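    """Write the rendered YAML to CONFIG_PATH and confirm the save."""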
    with open(CONFIG_PATH, "w", encoding="utf-8") as file:
        file.write(yaml_text)
    return "✅ Config saved!"