import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)

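# Hub client and target dataset repo; the write token is expected to be
# provided via the TOKEN environment variable (e.g. as a Space secret).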
API = HfApi()
TOKEN = os.environ.get("TOKEN")
REPO_ID = "Felix92/docTR-resource-collection"

def get_supported_chars(font_path: Path) -> list[str]:
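    """Return the printable characters covered by the font's cmap tables."""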
    try:
        font = TTFont(font_path)
        supported_chars = set()
        for table in font["cmap"].tables:
            supported_chars.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(supported_chars)]
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []

def get_sha256(file_path: Path) -> str:
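    """Compute the SHA-256 hex digest of a file, reading it in 8 KiB chunks."""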
    hash_sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()

def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
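    """Check whether a file whose name starts with `file_name` (here: its SHA-256)
    already exists under `subfolder` in the dataset repo."""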
    files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    return any(file.startswith(f"{subfolder}/{file_name}") for file in files)

def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
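    """Upload a local file to the dataset repo as `{subfolder}/{sha_hash}_{filename}`."""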
    filename = f"{sha_hash}_{Path(file_path).name}"
    repo_path = f"{subfolder}/{filename}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info(f"Uploaded {repo_path}")

def handle_uploads(font_upload, wordlist_upload, agree):
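    """Validate, deduplicate (by SHA-256) and upload the submitted fonts and wordlists,
    returning Gradio updates for the agree button, status message and both file inputs."""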
    if not agree:
        # Route the message to the status Markdown output, not the button output.
        return gr.update(), gr.Markdown("You must agree to the terms and conditions before proceeding."), gr.update(), gr.update()

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []

    results = []

    try:
        # Handle fonts
        for font_file in font_upload:
            font_path = Path(font_file)
            font_sha = get_sha256(font_path)
            if file_exists_on_hub(font_sha, "fonts"):
                results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
                continue

            supported_chars = get_supported_chars(font_path)
            if not supported_chars:
                results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
                continue

            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            json_sha = get_sha256(json_path)

            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)
            results.append(f"✅ Font **{font_path.name}** uploaded successfully.")

        # Handle wordlists
        for wordlist_file in wordlist_upload:
            wordlist_path = Path(wordlist_file)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
                continue

            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
            results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")

        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"), gr.update(value=None), gr.update(value=None)

    except Exception as e:
        logging.exception("Upload failed")
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"), gr.update(value=None), gr.update(value=None)


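# UI: an agreement screen that, once accepted, reveals the upload form.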
with gr.Blocks(fill_height=True) as demo:
    agreement_markdown = gr.Markdown(
        """
        <div style="text-align: center;">
        <h1>File Upload Agreement</h1>

        <h3>This is a Hugging Face Space for the docTR/OnnxTR community to collect wordlists and fonts for the following projects:</h3>

        <h3><a href="https://github.com/mindee/doctr">docTR</a></h3>

        <h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
        </div>

        <h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>

        <h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>

        <br>
        <br>

        <h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>

        <h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>

        <h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>

        <h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
        """
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    agree_state = gr.State(value=False)

    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple"
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple"
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
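        """Hide the agreement text and button, mark agreement as given, and show the upload form."""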
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])

    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

if __name__ == "__main__":
    demo.launch()