|
from huggingface_hub import HfApi, snapshot_download |
|
from loguru import logger |
|
|
|
# Module-wide Hub client, shared by every helper in this file.
api = HfApi()
|
|
|
|
|
def download_dataset_snapshot(repo_id, local_dir):
    """Fetch a complete snapshot of a Hub dataset repository to disk.

    On any failure the space is restarted as a blunt recovery measure
    instead of raising to the caller.

    Args:
        repo_id: Dataset repository id (e.g. "username/dataset-name").
        local_dir: Local directory the snapshot is written into.
    """
    try:
        logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            local_dir=local_dir,
            tqdm_class=None,  # suppress per-file progress bars in server logs
        )
    except Exception as e:
        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
        # NOTE(review): restart_space is given the *dataset* repo_id — confirm
        # the space shares this id, otherwise the restart call will fail.
        api.restart_space(repo_id=repo_id)
|
|
|
|
|
def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
    """
    Remove files or directories matching specified patterns from a Hugging Face dataset repository.

    Each matching file is deleted in its own commit; a failure on one file is
    logged and does not abort the remaining deletions.

    Args:
        repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
        path_patterns: List of glob-style file or directory path patterns to remove
        commit_message: Message for the commit that removes the files
    """
    import fnmatch

    # Full listing of the repo; patterns are matched against these paths.
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")

    # Expand every glob-style pattern into concrete repository paths.
    files_to_remove = []
    for pattern in path_patterns:
        files_to_remove.extend(fnmatch.filter(repo_files, pattern))

    # Delete one file per commit so a single failure is isolated.
    for path in files_to_remove:
        try:
            api.delete_file(
                path_in_repo=path, repo_id=repo_id, repo_type="dataset", commit_message=f"{commit_message}: {path}"
            )
            # Use the module logger (not print) for consistency with the rest
            # of this file's loguru-based logging.
            logger.info(f"Successfully removed {path} from {repo_id}")
        except Exception as e:
            logger.error(f"Error removing {path}: {e}")
|
|
|
|
|
def update_dataset_info_readme(
    repo_id: str,
    dataset_info: dict,
    license_id: str = None,
    commit_message: str = "Update dataset_info in README.md",
):
    """
    Update the dataset_info section in the README.md file of a Hugging Face dataset repository.

    Downloads the current README (treating a missing one as empty), rewrites
    its YAML front matter with the supplied metadata, and uploads the result.
    Upload failures are logged, not raised.

    Args:
        repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
        dataset_info: Dictionary containing dataset information to include in the README
        license_id: Optional license identifier (e.g., "mit", "cc-by-4.0")
        commit_message: Message for the commit

    Example dataset_info structure:
        {
            "features": [
                {"name": "text", "dtype": "string"},
                {"name": "label", "dtype": "int64"}
            ],
            "splits": [
                {"name": "train", "num_examples": 10000, "num_bytes": 1000000},
                {"name": "test", "num_examples": 1000, "num_bytes": 100000}
            ],
            "download_size": 1200000,
            "dataset_size": 1100000,
            "configs": [
                {
                    "config_name": "default",
                    "data_files": [
                        {"split": "train", "path": "data/train.csv"},
                        {"split": "test", "path": "data/test.csv"}
                    ]
                }
            ]
        }
    """
    import os
    import re
    import tempfile

    import yaml

    # Fetch the current README; hf_hub_download returns a local *path*.
    # A missing or unreadable README is treated as an empty document.
    try:
        readme_path = api.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="README.md", token=None)
        with open(readme_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception:
        content = ""

    # YAML front matter is only valid at the very top of the file, so anchor
    # the match there with re.match (re.search could wrongly grab a "---"
    # delimited block appearing later in the document body).
    yaml_match = re.match(r"---\s*\n(.*?)\n\s*---", content, re.DOTALL)

    if yaml_match:
        try:
            # safe_load returns None for an empty block; fall back to {} so
            # the key assignments below don't raise TypeError.
            yaml_block = yaml.safe_load(yaml_match.group(1)) or {}
        except Exception as e:
            logger.error(f"Error parsing existing YAML front matter: {e}")
            yaml_block = {}
    else:
        yaml_block = {}

    # Merge the caller-supplied metadata into the front matter.
    if dataset_info:
        yaml_block["dataset_info"] = dataset_info
    if license_id:
        yaml_block["license"] = license_id

    new_yaml = yaml.dump(yaml_block, sort_keys=False, default_flow_style=False)
    new_yaml_block = f"---\n{new_yaml}---\n"

    # Replace the existing front matter in place, or prepend a new one.
    if yaml_match:
        new_content = content[: yaml_match.start()] + new_yaml_block + content[yaml_match.end() :]
    else:
        new_content = new_yaml_block + content

    # Stage the new README in a temp file for upload_file.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
        temp_file.write(new_content)
        temp_path = temp_file.name

    try:
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
        )
        logger.info(f"Successfully updated README.md in {repo_id}")
    except Exception as e:
        logger.error(f"Error updating README.md: {e}")
    finally:
        # Always remove the temp file, even if the upload raised.
        os.unlink(temp_path)
|
|