import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)

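# Hub client and target dataset repo; the write token is expected to be
# provided via the TOKEN environment variable (e.g. as a Space secret).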
API = HfApi()
TOKEN = os.environ.get("TOKEN")
REPO_ID = "Felix92/docTR-resource-collection"

def get_supported_chars(font_path: Path) -> list[str]:
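    """Return the printable characters covered by the font's cmap tables."""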
    try:
        font = TTFont(font_path)
        supported_chars = set()
        for table in font["cmap"].tables:
            supported_chars.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(supported_chars)]
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []

def get_sha256(file_path: Path) -> str:
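    """Compute the SHA-256 hex digest of a file, reading it in 8 KiB chunks."""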
    hash_sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()

def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
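    """Check whether a file whose name starts with `file_name` (here: its SHA-256)
    already exists under `subfolder` in the dataset repo."""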
    files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    return any(file.startswith(f"{subfolder}/{file_name}") for file in files)

def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
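    """Upload a local file to the dataset repo as `{subfolder}/{sha_hash}_{filename}`."""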
    filename = f"{sha_hash}_{Path(file_path).name}"
    repo_path = f"{subfolder}/{filename}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info(f"Uploaded {repo_path}")

def handle_uploads(font_upload, wordlist_upload, agree):
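    """Validate, deduplicate (by SHA-256) and upload the submitted fonts and wordlists,
    returning Gradio updates for the agree button, status message and both file inputs."""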
    if not agree:
        # Route the message to the status Markdown output, not the button output.
        return gr.update(), gr.Markdown("You must agree to the terms and conditions before proceeding."), gr.update(), gr.update()

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []

    results = []

    try:
        # Handle fonts
        for font_file in font_upload:
            font_path = Path(font_file)
            font_sha = get_sha256(font_path)
            if file_exists_on_hub(font_sha, "fonts"):
                results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
                continue

            supported_chars = get_supported_chars(font_path)
            if not supported_chars:
                results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
                continue

            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            json_sha = get_sha256(json_path)

            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)
            results.append(f"✅ Font **{font_path.name}** uploaded successfully.")

        # Handle wordlists
        for wordlist_file in wordlist_upload:
            wordlist_path = Path(wordlist_file)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
                continue

            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
            results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")

        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"), gr.update(value=None), gr.update(value=None)

    except Exception as e:
        logging.exception("Upload failed")
        return gr.update(visible=False), gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"), gr.update(value=None), gr.update(value=None)


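# UI: an agreement screen that, once accepted, reveals the upload form.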
with gr.Blocks(fill_height=True) as demo:
    agreement_markdown = gr.Markdown(
        """
        <div style="text-align: center;">
        <h1>File Upload Agreement</h1>

        <h3>This is a Hugging Face Space for the docTR/OnnxTR community to collect wordlists and fonts for the following projects:</h3>

        <h3><a href="https://github.com/mindee/doctr">docTR</a></h3>

        <h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
        </div>

        <h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>

        <h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>

        <br>
        <br>

        <h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>

        <h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>

        <h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>

        <h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
        """
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    agree_state = gr.State(value=False)

    with gr.Column(visible=False) as upload_section:
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple"
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple"
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
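        """Hide the agreement text and button, mark agreement as given, and show the upload form."""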
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])

    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

if __name__ == "__main__":
    demo.launch()