"""Gradio app that collects fonts and wordlists and uploads them to a Hugging Face dataset repo."""

import hashlib
import json
import logging
import os
from pathlib import Path

import gradio as gr
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)

API = HfApi()
TOKEN = os.environ.get("TOKEN")
REPO_ID = "Felix92/docTR-resource-collection"


def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters mapped by the font's ``cmap`` tables.

    Args:
        font_path: Path of the font file to inspect.

    Returns:
        The printable characters ordered by code point, or an empty list when
        the font cannot be read (the error is logged, not raised).
    """
    try:
        # The context manager closes the underlying font file on success and
        # on failure, so no file handle is left open per inspected font.
        with TTFont(font_path) as font:
            code_points = set()
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
            chars = [chr(code_point) for code_point in sorted(code_points)]
            return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error("Error reading font file %s: %s", font_path, e)
        return []
    except Exception as e:
        logging.error("Unexpected error reading font file %s: %s", font_path, e)
        return []


def get_sha256(file_path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is read in 8 KiB chunks so it never has to fit in memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        while chunk := f.read(8192):
            digest.update(chunk)
    return digest.hexdigest()


def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Tell whether the dataset repo holds a file starting with ``subfolder/file_name``.

    The match is a prefix match, so passing a hash as ``file_name`` finds any
    file that was stored under a ``<hash>_...`` name in that subfolder.

    Args:
        file_name: Leading part of the file name to look for.
        subfolder: Folder inside the repo to look in.

    Returns:
        True if at least one repo file path has that prefix.
    """
    files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    prefix = f"{subfolder}/{file_name}"
    return any(file.startswith(prefix) for file in files)


def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload ``file_path`` to the dataset repo as ``subfolder/<sha_hash>_<file name>``."""
    filename = f"{sha_hash}_{Path(file_path).name}"
    repo_path = f"{subfolder}/{filename}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info("Uploaded %s", repo_path)


def _process_font(font_path: Path) -> str:
    """Upload one font plus its character-coverage JSON and return a status line."""
    font_sha = get_sha256(font_path)
    if file_exists_on_hub(font_sha, "fonts"):
        # The line break after "Font " is part of the original message text.
        return f"⚠️ Font \n**{font_path.name}** was already uploaded."

    supported_chars = get_supported_chars(font_path)
    if not supported_chars:
        return f"⚠️ Font **{font_path.name}** has no supported characters."

    metadata = {
        "font_name": font_path.stem,
        "supported_characters": supported_chars,
    }
    json_path = font_path.with_suffix(".json")
    try:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        json_sha = get_sha256(json_path)
        _upload_hub(str(font_path), "fonts", font_sha)
        _upload_hub(str(json_path), "fonts", json_sha)
    finally:
        # The sidecar file only exists to be uploaded; remove it whether or
        # not the upload succeeded so it does not pile up next to the uploads.
        json_path.unlink(missing_ok=True)
    return f"✅ Font **{font_path.name}** uploaded successfully."


def _process_wordlist(wordlist_path: Path) -> str:
    """Upload one wordlist unless its hash is already on the hub; return a status line."""
    wordlist_sha = get_sha256(wordlist_path)
    if file_exists_on_hub(wordlist_sha, "wordlists"):
        return f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded."
    _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
    return f"✅ Wordlist **{wordlist_path.name}** uploaded successfully."


def handle_uploads(font_upload, wordlist_upload, agree):
    """Upload the submitted fonts and wordlists and report the outcome.

    Args:
        font_upload: Paths of the submitted font files, or None when empty.
        wordlist_upload: Paths of the submitted wordlists, or None when empty.
        agree: Whether the user accepted the terms.

    Returns:
        A 4-tuple of component updates, in the order of the ``outputs`` list
        of the submit button: agree button, status message, font input,
        wordlist input. Both file inputs are cleared afterwards. Any exception
        raised while processing is logged and reported in the status message.
    """
    if not agree:
        # NOTE(review): this Markdown lands in the agree-button output slot,
        # not in the status-message slot — confirm that this is intended.
        return gr.Markdown("You must agree to the terms and conditions before proceeding."), None, None, None

    font_files = font_upload or []
    wordlist_files = wordlist_upload or []
    results = []

    try:
        # Handle fonts
        for font_file in font_files:
            results.append(_process_font(Path(font_file)))

        # Handle wordlists
        for wordlist_file in wordlist_files:
            results.append(_process_wordlist(Path(wordlist_file)))

        if not results:
            results.append("⚠️ No files uploaded.")

        # A blank line between entries makes Markdown render each status line
        # as its own paragraph; single newlines would be folded into one line.
        result_md = "\n\n".join(results)
        return (
            gr.update(visible=False),
            gr.Markdown(f"\n{result_md}\n"),
            gr.update(value=None),
            gr.update(value=None),
        )
    except Exception as e:
        logging.exception("Upload failed")
        return (
            gr.update(visible=False),
            gr.Markdown(f"\n\nAn error occurred: {e}\n\n"),
            gr.update(value=None),
            gr.update(value=None),
        )


with gr.Blocks(fill_height=True) as demo:
    # The agreement text is kept at column 0 so that Markdown does not treat
    # it as an indented code block.
    agreement_markdown = gr.Markdown(
        """

File Upload Agreement

This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:

docTR

OnnxTR

The uploaded wordlists and fonts will be used to generate synthetic data.

All uploaded files can be found here: Hugging Face dataset



By uploading a wordlist or font, you explicitly agree to the following terms:

1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.

2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.

3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.

"""
    )

    agree_button = gr.Button("I Agree to the Terms and Conditions")
    agree_state = gr.State(value=False)

    # The upload form stays hidden until the terms have been accepted.
    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple",
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple",
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, record consent, and show the upload form."""
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(
        fn=toggle_agreement_visibility,
        inputs=None,
        outputs=[agreement_markdown, agree_button, agree_state, upload_section],
    )
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

if __name__ == "__main__":
    demo.launch()