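"""Sync a set of Hugging Face leaderboard datasets from one namespace to another.

For each dataset, the script ensures the destination repo exists (creating it
as a private dataset if needed), downloads a full snapshot of the source repo
into a temporary directory, and re-uploads it to the destination while
preserving the file structure. Authentication uses the HF_TOKEN loaded from
the .env file at the repository root.
"""
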
import os
import tempfile
import logging
from pathlib import Path
from huggingface_hub import HfApi, snapshot_download, create_repo
from dotenv import load_dotenv

# Configure source and destination usernames
SOURCE_USERNAME = "stacklok"
DESTINATION_USERNAME = "tfrere"

# Get the backend directory path
BACKEND_DIR = Path(__file__).parent.parent
ROOT_DIR = BACKEND_DIR.parent

# Load environment variables from .env file in root directory
load_dotenv(ROOT_DIR / ".env")

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# List of dataset names to sync
DATASET_NAMES = [
    "llm-security-leaderboard-votes",
    "llm-security-leaderboard-requests",
    "llm-security-leaderboard-results",
    "llm-security-leaderboard-contents",
    # "llm-security-leaderboard-official-providers",
]

# Build list of datasets with their source and destination paths
DATASETS = [
    (name, f"{SOURCE_USERNAME}/{name}", f"{DESTINATION_USERNAME}/{name}")
    for name in DATASET_NAMES
] + [
    (
        "official-providers",
        "open-llm-leaderboard/official-providers",
        f"{DESTINATION_USERNAME}/official-providers",
    )
]
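
# Each entry is a (name, source_repo, destination_repo) tuple; for example,
# the first one expands to:
#   ("llm-security-leaderboard-votes",
#    "stacklok/llm-security-leaderboard-votes",
#    "tfrere/llm-security-leaderboard-votes")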

# Initialize Hugging Face API
api = HfApi()


def ensure_repo_exists(repo_id, token):
    """Ensure the repository exists, create it if it doesn't"""
    try:
        api.repo_info(repo_id=repo_id, repo_type="dataset")
        logger.info(f"βœ“ Repository {repo_id} already exists")
    except Exception:
        logger.info(f"Creating repository {repo_id}...")
        create_repo(repo_id=repo_id, repo_type="dataset", token=token, private=True)
        logger.info(f"βœ“ Repository {repo_id} created")


def process_dataset(dataset_info, token):
    """Process a single dataset"""
    name, source_dataset, destination_dataset = dataset_info
    try:
        logger.info(f"\nπŸ“₯ Processing dataset: {name}")

        # Ensure destination repository exists
        ensure_repo_exists(destination_dataset, token)

        # Create a temporary directory for this dataset
        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                # List files in source dataset
                logger.info(f"Listing files in {source_dataset}...")
                files = api.list_repo_files(
                    source_dataset, repo_type="dataset", token=token
                )
                logger.info(f"Found {len(files)} files in source")

                # Download dataset
                logger.info(f"Downloading from {source_dataset}...")
                local_dir = snapshot_download(
                    repo_id=source_dataset,
                    repo_type="dataset",
                    local_dir=temp_dir,
                    token=token,
                )
                logger.info("βœ“ Download complete")

                # Upload to destination while preserving structure
                logger.info(f"πŸ“€ Uploading to {destination_dataset}...")
                api.upload_folder(
                    folder_path=local_dir,
                    repo_id=destination_dataset,
                    repo_type="dataset",
                    token=token,
                )
                logger.info(f"βœ… {name} copied successfully!")
                return True

            except Exception as e:
                logger.error(f"❌ Error processing {name}: {str(e)}")
                return False

    except Exception as e:
        logger.error(f"❌ Error for {name}: {str(e)}")
        return False


def copy_datasets():
    try:
        logger.info("πŸ”‘ Checking authentication...")
        # Get token from .env file
        token = os.getenv("HF_TOKEN")
        if not token:
            raise ValueError("HF_TOKEN not found in .env file")

        # Process datasets sequentially
        results = []
        for dataset_info in DATASETS:
            success = process_dataset(dataset_info, token)
            results.append((dataset_info[0], success))

        # Print final summary
        logger.info("\nπŸ“Š Final summary:")
        for dataset, success in results:
            status = "βœ… Success" if success else "❌ Failure"
            logger.info(f"{dataset}: {status}")

    except Exception as e:
        logger.error(f"❌ Global error: {str(e)}")


if __name__ == "__main__":
    copy_datasets()
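
# Usage sketch (the script filename is illustrative; the only real
# requirement is a write-scoped token in the root .env file):
#
#   echo 'HF_TOKEN=hf_xxx' >> <repo_root>/.env
#   python sync_datasets.py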