import yaml
from loguru import logger


def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
    """Creates the base config dictionary"""
    return {
        "hf_configuration": {
            "token": "$HF_TOKEN",
            "private": True,
            "hf_organization": hf_org,
            "hf_dataset_name": hf_dataset_name,
        },
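        # Models available to the pipeline, both served through the "novita" provider.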
        "model_list": [
            {
                "model_name": "meta-llama/Llama-3.3-70B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            },
            {
                "model_name": "Qwen/Qwen2.5-72B-Instruct",
                "provider": "novita",
                "max_concurrent_requests": 32,
            }
        ],
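        # Maps each pipeline stage to the model(s) allowed to run it.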
        "model_roles": {
            "ingestion": ["meta-llama/Llama-3.3-70B-Instruct"],
            "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
            "single_shot_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
            "multi_hop_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
            "answer_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            "judge_answers": ["meta-llama/Llama-3.3-70B-Instruct"],
        },
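        # Per-stage pipeline settings; file paths are scoped to this session's directory.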
        "pipeline": {
            "ingestion": {
                "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
                "output_dir": f"/app/{session_uid}/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": f"/app/{session_uid}/ingested",
                "run": True,
            },
            "summarization": {"run": True},
            "chunking": {
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.3,
                    "h_min": 2,
                    "h_max": 4,
                },
                "run": True,
            },
            "single_shot_question_generation": {
                "diversification_seed": "24 year old adult",
                "run": True,
            },
            "multi_hop_question_generation": {"run": False},
            "answer_generation": {
                "question_type": "single_shot",
                "run": True,
                "strategies": [
                    {
                        "name": "zeroshot",
                        "prompt": "ZEROSHOT_QA_USER_PROMPT",
                        "model_name": "meta-llama/Llama-3.3-70B-Instruct",
                    },
                    {
                        "name": "gold",
                        "prompt": "GOLD_QA_USER_PROMPT",
                        "model_name": "meta-llama/Llama-3.3-70B-Instruct",
                    },
                ],
            },
            "judge_answers": {
                "run": False, # to change when fixed
                "comparing_strategies": [["zeroshot", "gold"]],
                "chunk_column_index": 0,
                "random_seed": 42,
            },
        },
    }


def save_yaml_file(config: dict, path: str) -> str:
    """Saves the given config dictionary to a YAML file and returns its path"""
    with open(path, "w") as file:
        yaml.dump(config, file, default_flow_style=False, sort_keys=False)
    return path


def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
    """Generates and saves the YAML configuration file"""
    logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
    config = generate_base_config(hf_org, hf_name, session_uid)
    file_path = save_yaml_file(config, config_path)
    logger.success(f"Config saved at: {file_path}")
    return file_path
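

# Hypothetical usage sketch (not part of the original module): the org, dataset name,
# session UID, and output path below are illustrative placeholders.
if __name__ == "__main__":
    generate_and_save_config(
        hf_org="my-org",
        hf_name="my-dataset",
        session_uid="session-1234",
        config_path="/app/session-1234/config.yaml",
    )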