import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi
# Module-level configuration: root logger at INFO, a shared Hub API client,
# and the target dataset repository.
logging.basicConfig(level=logging.INFO)
API = HfApi()
# Hub write token is injected via the environment (e.g. a Space secret);
# None if unset, in which case uploads will fail with an auth error.
TOKEN = os.environ.get("TOKEN")
# Dataset repo that collects the uploaded fonts and wordlists.
REPO_ID = "Felix92/docTR-resource-collection"
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters a font file can render.

    Unions the code points of every cmap subtable, so characters mapped by
    any encoding are included.

    Args:
        font_path: Path to a TTF/OTF font file.

    Returns:
        Sorted list of printable characters, or an empty list if the file
        cannot be parsed.
    """
    try:
        font = TTFont(font_path)
        try:
            code_points: set[int] = set()
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
        finally:
            # TTFont keeps the underlying file reader open; close it
            # explicitly instead of relying on garbage collection.
            font.close()
        chars = (chr(code_point) for code_point in sorted(code_points))
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Broad fallback: a malformed font must never crash the app,
        # e.g. a missing "cmap" table raises KeyError.
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of *file_path*.

    Reads the file in 8 KiB chunks so arbitrarily large files can be
    hashed with constant memory.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as fh:
        while chunk := fh.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Check whether a file whose repo path starts with
    ``{subfolder}/{file_name}`` already exists in the dataset repo.

    Used for duplicate detection: uploads are stored with a SHA-256
    prefix, so passing the hash as *file_name* matches any prior upload
    of the same content.
    """
    prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    for repo_file in repo_files:
        if repo_file.startswith(prefix):
            return True
    return False
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the dataset repo under *subfolder*.

    The file is stored as ``{subfolder}/{sha_hash}_{basename}`` so that
    ``file_exists_on_hub`` can later detect duplicates by hash prefix.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Target folder in the repo ("fonts" or "wordlists").
        sha_hash: SHA-256 hex digest of the file contents.
    """
    filename = f"{sha_hash}_{Path(file_path).name}"
    # Bug fix: previously a placeholder literal was used here instead of
    # `filename`, so every upload collided on one repo path and the
    # hash-prefix duplicate check could never match.
    repo_path = f"{subfolder}/{filename}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info(f"Uploaded {repo_path}")
def handle_uploads(font_upload, wordlist_upload, agree):
if not agree:
return gr.Markdown("You must agree to the terms and conditions before proceeding."), None, None, None
font_upload = font_upload or []
wordlist_upload = wordlist_upload or []
results = []
try:
# Handle fonts
for font_file in font_upload:
font_path = Path(font_file)
font_sha = get_sha256(font_path)
if file_exists_on_hub(font_sha, "fonts"):
results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
continue
supported_chars = get_supported_chars(font_path)
if not supported_chars:
results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
continue
metadata = {
"font_name": font_path.stem,
"supported_characters": supported_chars,
}
json_path = font_path.with_suffix(".json")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
json_sha = get_sha256(json_path)
_upload_hub(str(font_path), "fonts", font_sha)
_upload_hub(str(json_path), "fonts", json_sha)
results.append(f"✅ Font **{font_path.name}** uploaded successfully.")
# Handle wordlists
for wordlist_file in wordlist_upload:
wordlist_path = Path(wordlist_file)
wordlist_sha = get_sha256(wordlist_path)
if file_exists_on_hub(wordlist_sha, "wordlists"):
results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
continue
_upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")
if not results:
results.append("⚠️ No files uploaded.")
result_md = "
".join(results)
return gr.update(visible=False), gr.Markdown(f"