Spaces:
Sleeping
Sleeping
File size: 6,655 Bytes
a9f7f11 bae6c77 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 cc2aa56 a9f7f11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi
# Emit INFO-level (and above) records through the root logger.
logging.basicConfig(level=logging.INFO)
# Shared Hugging Face Hub client used by the hub helper functions in this file.
API = HfApi()
# Hub access token read from the TOKEN environment variable (None when unset).
TOKEN = os.environ.get("TOKEN")
# Repository that receives the uploads; it is accessed with repo_type="dataset".
REPO_ID = "Felix92/docTR-resource-collection"
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters covered by a font's ``cmap`` tables.

    Args:
        font_path: Path of the font file to inspect.

    Returns:
        The printable characters mapped by any ``cmap`` subtable, ordered by
        code point. An empty list is returned (and the error is logged) when
        the font cannot be read or processed.
    """
    try:
        # Use the font as a context manager so that it is closed again even
        # when reading the tables fails; an open TTFont may hold on to the
        # underlying file handle until close() is called.
        with TTFont(font_path) as font:
            code_points = set()
            # Merge the code points of every subtable; duplicates collapse.
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
        chars = [chr(code_point) for code_point in sorted(code_points)]
        return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Best effort: any other failure (e.g. a font without a cmap table)
        # is reported as "no supported characters" instead of crashing.
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's contents.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # Feed the hash in 8 KiB blocks so the file is never loaded at once.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Check whether the dataset repo already holds a matching file.

    A repo file matches when its path starts with ``"<subfolder>/<file_name>"``,
    so ``file_name`` may be just the leading part of the stored name.

    Args:
        file_name: Name (or name prefix) to look for.
        subfolder: Folder inside the repository to look in.

    Returns:
        True if at least one repo file path starts with the prefix.
    """
    prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=TOKEN)
    for repo_file in repo_files:
        if repo_file.startswith(prefix):
            return True
    return False
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the dataset repo under a hash-prefixed name.

    The file is stored as ``"<subfolder>/<sha_hash>_<original file name>"``.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Destination folder inside the repository.
        sha_hash: Hash string prepended to the stored file name.
    """
    repo_path = f"{subfolder}/{sha_hash}_{Path(file_path).name}"
    API.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_path,
        token=TOKEN,
        repo_type="dataset",
        repo_id=REPO_ID,
    )
    logging.info(f"Uploaded {repo_path}")
def handle_uploads(font_upload, wordlist_upload, agree):
    """Upload the submitted fonts and wordlists and report the outcome.

    Fonts already present on the hub (matched by content hash) or without any
    printable characters are skipped; every other font is uploaded together
    with a JSON file listing its supported characters. Wordlists are uploaded
    unless a file with the same content hash already exists.

    Args:
        font_upload: Paths of the submitted font files, or None when empty.
        wordlist_upload: Paths of the submitted wordlist files, or None when
            empty.
        agree: Whether the user accepted the upload agreement.

    Returns:
        A 4-tuple matching the outputs of the submit handler, in order:
        agree button, status message, font input, wordlist input.
    """
    if not agree:
        # The message belongs in the second slot (the status Markdown). The
        # first slot is the agree button, which is left unchanged here.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            None,
            None,
        )

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []
    results = []

    try:
        # Handle fonts
        for font_file in font_upload:
            font_path = Path(font_file)
            font_sha = get_sha256(font_path)
            # Stored names start with the content hash, so the hash alone is
            # enough to detect a previous upload of the same file.
            if file_exists_on_hub(font_sha, "fonts"):
                results.append(f"⚠️ Font **{font_path.name}** was already uploaded.")
                continue

            supported_chars = get_supported_chars(font_path)
            if not supported_chars:
                results.append(f"⚠️ Font **{font_path.name}** has no supported characters.")
                continue

            metadata = {
                "font_name": font_path.stem,
                "supported_characters": supported_chars,
            }
            # The metadata file is written next to the uploaded font.
            json_path = font_path.with_suffix(".json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            json_sha = get_sha256(json_path)

            _upload_hub(str(font_path), "fonts", font_sha)
            _upload_hub(str(json_path), "fonts", json_sha)
            results.append(f"✅ Font **{font_path.name}** uploaded successfully.")

        # Handle wordlists
        for wordlist_file in wordlist_upload:
            wordlist_path = Path(wordlist_file)
            wordlist_sha = get_sha256(wordlist_path)
            if file_exists_on_hub(wordlist_sha, "wordlists"):
                results.append(f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded.")
                continue

            _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
            results.append(f"✅ Wordlist **{wordlist_path.name}** uploaded successfully.")

        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        # Show the per-file summary and clear both file inputs.
        return (
            gr.update(visible=False),
            gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"),
            gr.update(value=None),
            gr.update(value=None),
        )
    except Exception as e:
        # UI boundary: log the traceback and show the error to the user
        # instead of letting the event handler crash.
        logging.exception("Upload failed")
        return (
            gr.update(visible=False),
            gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"),
            gr.update(value=None),
            gr.update(value=None),
        )
# UI definition: an agreement screen that is swapped for the upload form once
# the user clicks the agree button.
with gr.Blocks(fill_height=True) as demo:
    # Agreement text; hidden by toggle_agreement_visibility after acceptance.
    agreement_markdown = gr.Markdown(
        """
<div style="text-align: center;">
<h1>File Upload Agreement</h1>
<h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>
<h3><a href="https://github.com/mindee/doctr">docTR</a></h3>
<h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
</div>
<h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>
<h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>
<br>
<br>
<h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>
<h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>
<h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>
<h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
"""
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Starts as False and is set to True by toggle_agreement_visibility; it is
    # passed to handle_uploads as the `agree` argument.
    agree_state = gr.State(value=False)

    # Upload form; created hidden and made visible once the user agrees.
    with gr.Column(visible=False) as upload_section:
        # Receives the status message returned by handle_uploads.
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple"
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple"
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, record consent, show the form.

        The returned values map, in order, onto the outputs of the agree
        button's click handler: agreement_markdown, agree_button, agree_state
        and upload_section.
        """
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])
    # handle_uploads returns one value per entry of `outputs`, in this order.
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|