# Hugging Face Space: community uploader collecting fonts and wordlists
# for docTR / OnnxTR synthetic-data generation.
import os | |
import gradio as gr | |
import logging | |
import json | |
import hashlib | |
from pathlib import Path | |
from fontTools.ttLib import TTFont, TTLibError | |
from huggingface_hub import HfApi | |
logging.basicConfig(level=logging.INFO) | |
API = HfApi() | |
TOKEN = os.environ.get("TOKEN") | |
REPO_ID = "Felix92/docTR-resource-collection" | |
def get_supported_chars(font_path: Path) -> list[str]:
    """Return every printable character a font file can render.

    Collects the mapped code points from all of the font's cmap tables and
    converts them to characters in ascending code-point order.

    Args:
        font_path: Path to a TTF/OTF font file.

    Returns:
        Printable characters supported by the font, or an empty list if the
        font cannot be read.
    """
    try:
        code_points: set[int] = set()
        for cmap_table in TTFont(font_path)["cmap"].tables:
            code_points.update(cmap_table.cmap.keys())
        # chr() conversion + printability filter in a single pass.
        return [c for c in map(chr, sorted(code_points)) if c.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Defensive catch-all: a malformed font must never crash the app.
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file.

    Streams the file in 8 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.

    Args:
        file_path: Path of the file to hash.

    Returns:
        Hex-encoded SHA-256 digest of the file's contents.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while chunk := stream.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Check whether a matching file is already present in the Hub dataset.

    Args:
        file_name: File-name prefix to search for (callers pass a SHA-256
            digest, since uploads are stored as ``<sha>_<original-name>``).
        subfolder: Repository subfolder to search ("fonts" or "wordlists").

    Returns:
        True if any repo file path starts with ``<subfolder>/<file_name>``.
    """
    prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    return any(repo_file.startswith(prefix) for repo_file in repo_files)
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the Hub dataset under a content-addressed name.

    The file is stored as ``<subfolder>/<sha_hash>_<original-name>`` so that
    duplicates can later be detected by their SHA-256 prefix (see
    ``file_exists_on_hub``).

    Args:
        file_path: Local path of the file to upload.
        subfolder: Target subfolder in the repo ("fonts" or "wordlists").
        sha_hash: SHA-256 hex digest of the file, used as the name prefix.
    """
    filename = f"{sha_hash}_{Path(file_path).name}"
    # BUG FIX: repo_path previously discarded `filename` (it was never used),
    # so every upload in a subfolder targeted the same repo path and the
    # sha-prefix dedup check could never match. Include the hashed name.
    repo_path = f"{subfolder}/{filename}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info(f"Uploaded {repo_path}")
def handle_uploads(font_upload, wordlist_upload, agree):
    """Validate and upload submitted font and wordlist files to the Hub.

    For each font: skip if the same content (by SHA-256) already exists on
    the Hub, extract its printable character set, write that metadata to a
    sibling JSON file, and upload both files. For each wordlist: upload
    unless a duplicate already exists.

    Args:
        font_upload: List of local font file paths from gr.File (or None).
        wordlist_upload: List of local wordlist file paths (or None).
        agree: Whether the user accepted the terms (from agree_state).

    Returns:
        A 4-tuple of Gradio values matching the submit_button outputs
        [agree_button, success_message, font_upload, wordlist_upload].
    """
    if not agree:
        # NOTE(review): this branch returns a gr.Markdown for the
        # agree_button output slot and bare None for the rest, unlike the
        # gr.update(...) tuples below — confirm this renders as intended.
        return gr.Markdown("You must agree to the terms and conditions before proceeding."), None, None, None
    # gr.File with file_count="multiple" yields None when nothing was picked.
    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []
    results = []
    try:
        # Handle fonts
        for font_file in font_upload:
            font_path = Path(font_file)
            font_sha = get_sha256(font_path)
            # Content-addressed dedup: uploads are named <sha>_<name>.
            if file_exists_on_hub(font_sha, "fonts"):
                results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
                continue
            supported_chars = get_supported_chars(font_path)
            # Empty list also covers unreadable/corrupt fonts (see helper).
            if not supported_chars:
                results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
                continue
            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            # Write the metadata JSON next to the (temporary) font file.
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            json_sha = get_sha256(json_path)
            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)
            results.append(f"✅ Font **{font_path.name}** uploaded successfully.")
        # Handle wordlists
        for wordlist_file in wordlist_upload:
            wordlist_path = Path(wordlist_file)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
                continue
            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
            results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")
        if not results:
            results.append("⚠️ No files uploaded.")
        # One status line per file, joined for a single Markdown component.
        result_md = "<br>".join(results)
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"), gr.update(value=None), gr.update(value=None)
    except Exception as e:
        # Surface any unexpected failure to the user; full traceback to logs.
        logging.exception("Upload failed")
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"), gr.update(value=None), gr.update(value=None)
# --- Gradio UI: agreement gate first, upload widgets revealed after consent ---
with gr.Blocks(fill_height=True) as demo:
    agreement_markdown = gr.Markdown(
        """
        <div style="text-align: center;">
        <h1>File Upload Agreement</h1>
        <h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>
        <h3><a href="https://github.com/mindee/doctr">docTR</a></h3>
        <h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
        </div>
        <h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>
        <h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>
        <br>
        <br>
        <h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>
        <h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>
        <h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>
        <h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
        """
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Server-side flag checked by handle_uploads, so consent can't be bypassed
    # by making the hidden section visible client-side.
    agree_state = gr.State(value=False)
    # Hidden until the user clicks the agree button.
    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple"
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple"
        )
        submit_button = gr.Button("Submit")
    def toggle_agreement_visibility():
        # Hide the agreement text + button, set consent, reveal the uploads.
        # Order matches the `outputs` list of agree_button.click below.
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)
    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )
if __name__ == "__main__":
    demo.launch()