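# NOTE: the following lines appear to be page residue from the Hugging Face
# file viewer (author, commit message, commit hash, file size), kept as comments.
# Felix92's picture
# update to multifile
# cc2aa56
# raw
# history blame contribute delete
# 6.66 kB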
import os
import gradio as gr
import logging
import json
import hashlib
from pathlib import Path
from fontTools.ttLib import TTFont, TTLibError
from huggingface_hub import HfApi
# Route INFO-level (and above) records through the root logger.
logging.basicConfig(level=logging.INFO)

# Dataset repository on the Hugging Face Hub that receives the uploads.
REPO_ID = "Felix92/docTR-resource-collection"
# Hub access token taken from the environment; None when TOKEN is unset.
TOKEN = os.getenv("TOKEN")
# Shared Hub client used for listing and uploading repository files.
API = HfApi()
def get_supported_chars(font_path: Path) -> list[str]:
    """Return the printable characters covered by a font's ``cmap`` tables.

    Args:
        font_path: Path of the font file to inspect.

    Returns:
        The printable characters mapped by any ``cmap`` subtable, ordered by
        code point. An empty list is returned when the font cannot be read or
        parsed; the failure is logged rather than raised.
    """
    try:
        # TTFont holds on to the opened font file until it is closed; the
        # context manager releases the handle on success and on failure.
        with TTFont(font_path) as font:
            code_points = set()
            for table in font["cmap"].tables:
                code_points.update(table.cmap.keys())
            chars = [chr(code_point) for code_point in sorted(code_points)]
            return [char for char in chars if char.isprintable()]
    except TTLibError as e:
        logging.error(f"Error reading font file {font_path}: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort: any other failure is treated as "no characters".
        logging.error(f"Unexpected error reading font file {font_path}: {e}")
        return []
def get_sha256(file_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's contents.

    The file is streamed in 8192-byte chunks so it never has to be held in
    memory as a whole.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal SHA-256 digest of the file's bytes.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # An empty read marks end-of-file and ends the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
def file_exists_on_hub(file_name: str, subfolder: str) -> bool:
    """Check whether the Hub dataset already holds a matching file.

    Args:
        file_name: Leading part of the file name to look for inside
            ``subfolder``.
        subfolder: Folder of the dataset repository to search.

    Returns:
        True when at least one repository path starts with
        ``"<subfolder>/<file_name>"``, False otherwise.
    """
    prefix = f"{subfolder}/{file_name}"
    repo_files = API.list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        token=TOKEN,
    )
    for repo_file in repo_files:
        if repo_file.startswith(prefix):
            return True
    return False
def _upload_hub(file_path: str, subfolder: str, sha_hash: str) -> None:
    """Upload a local file to the Hub dataset under a hash-prefixed name.

    Args:
        file_path: Local path of the file to upload.
        subfolder: Destination folder inside the dataset repository.
        sha_hash: Hash string prepended to the file's base name.
    """
    base_name = Path(file_path).name
    destination = f"{subfolder}/{sha_hash}_{base_name}"
    API.upload_file(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_in_repo=destination,
        path_or_fileobj=file_path,
        token=TOKEN,
    )
    logging.info(f"Uploaded {destination}")
def _process_font(font_path: Path) -> str:
    """Upload one font plus its JSON metadata and return a status line."""
    font_sha = get_sha256(font_path)
    if file_exists_on_hub(font_sha, "fonts"):
        return f"⚠️ Font **{font_path.name}** was already uploaded."

    supported_chars = get_supported_chars(font_path)
    if not supported_chars:
        return f"⚠️ Font **{font_path.name}** has no supported characters."

    metadata = {
        "font_name": font_path.stem,
        "supported_characters": supported_chars,
    }
    # The metadata sidecar is written next to the uploaded font file.
    json_path = font_path.with_suffix(".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    json_sha = get_sha256(json_path)

    _upload_hub(str(font_path), "fonts", font_sha)
    _upload_hub(str(json_path), "fonts", json_sha)
    return f"✅ Font **{font_path.name}** uploaded successfully."


def _process_wordlist(wordlist_path: Path) -> str:
    """Upload one wordlist unless its hash is already on the Hub; return a status line."""
    wordlist_sha = get_sha256(wordlist_path)
    if file_exists_on_hub(wordlist_sha, "wordlists"):
        return f"⚠️ Wordlist **{wordlist_path.name}** was already uploaded."

    _upload_hub(str(wordlist_path), "wordlists", wordlist_sha)
    return f"✅ Wordlist **{wordlist_path.name}** uploaded successfully."


def handle_uploads(font_upload, wordlist_upload, agree):
    """Process the submitted fonts and wordlists and report the outcome.

    Args:
        font_upload: File paths of the submitted fonts, or None when empty.
        wordlist_upload: File paths of the submitted wordlists, or None when
            empty.
        agree: Whether the user accepted the terms and conditions.

    Returns:
        A 4-tuple of component updates in the order the submit handler wires
        its outputs: agree button, status message, font input, wordlist input.
        Any exception raised while processing is logged and reported in the
        status message instead of being propagated.
    """
    if not agree:
        # The warning belongs in the status-message slot (second output); the
        # first slot is the agree button, which must be left unchanged here.
        return (
            gr.update(),
            gr.Markdown("You must agree to the terms and conditions before proceeding."),
            None,
            None,
        )

    font_upload = font_upload or []
    wordlist_upload = wordlist_upload or []

    try:
        results = [_process_font(Path(font_file)) for font_file in font_upload]
        results.extend(
            _process_wordlist(Path(wordlist_file)) for wordlist_file in wordlist_upload
        )
        if not results:
            results.append("⚠️ No files uploaded.")

        result_md = "<br>".join(results)
        return (
            gr.update(visible=False),
            gr.Markdown(f"<div style='text-align: center;'>{result_md}</div>"),
            gr.update(value=None),
            gr.update(value=None),
        )
    except Exception as e:
        # Top-level UI boundary: log the traceback and show the error to the user.
        logging.exception("Upload failed")
        return (
            gr.update(visible=False),
            gr.Markdown(f"<div style='text-align: center;'><h3>An error occurred: {e}</h3></div>"),
            gr.update(value=None),
            gr.update(value=None),
        )
# Build the UI: an agreement screen first, then the upload form once accepted.
with gr.Blocks(fill_height=True) as demo:
    # Terms shown on load; hidden again by toggle_agreement_visibility.
    agreement_markdown = gr.Markdown(
        """
<div style="text-align: center;">
<h1>File Upload Agreement</h1>
<h3>This is a Hugging Face space for the docTR/OnnxTR community to collect wordlists and fonts for the following project/s:</h3>
<h3><a href="https://github.com/mindee/doctr">docTR</a></h3>
<h3><a href="https://github.com/felixdittrich92/OnnxTR">OnnxTR</a></h3>
</div>
<h3>The uploaded wordlists and fonts will be used to generate synthetic data.</h3>
<h3>All uploaded files can be found here: <a href="https://huggingface.co/datasets/Felix92/docTR-resource-collection">Hugging Face dataset</a></h3>
<br>
<br>
<h3>By uploading a wordlist or font, you explicitly agree to the following terms:</h3>
<h3>1. You affirm that you are the owner or have the necessary rights to upload and share the wordlist or font.</h3>
<h3>2. You agree that the uploaded wordlists / fonts will be made publicly available to everyone.</h3>
<h3>3. You agree that the uploaded wordlists / fonts can be used for any purpose, including commercial use, by any third party.</h3>
"""
    )
    agree_button = gr.Button("I Agree to the Terms and Conditions")
    # Session flag passed to handle_uploads; set to True by the agree button.
    agree_state = gr.State(value=False)
    # Upload form, hidden until the agree button is clicked.
    # NOTE(review): nesting of the four components below inside the column is
    # inferred (original indentation was lost) — confirm.
    with gr.Column(visible=False) as upload_section:
        # Receives the status text returned by handle_uploads.
        success_message = gr.Markdown(visible=True)
        font_upload = gr.File(
            label="Upload Font File(s) [TTF | OTF]",
            file_types=[".ttf", ".otf"],
            type="filepath",
            file_count="multiple"
        )
        wordlist_upload = gr.File(
            label="Upload Wordlist(s) [TXT]",
            file_types=[".txt"],
            type="filepath",
            file_count="multiple"
        )
        submit_button = gr.Button("Submit")

    def toggle_agreement_visibility():
        """Hide the agreement text and button, set the agreed flag, show the upload form."""
        # Order matches the outputs list of agree_button.click below.
        return gr.update(visible=False), gr.update(visible=False), True, gr.update(visible=True)

    agree_button.click(fn=toggle_agreement_visibility, inputs=None, outputs=[agreement_markdown, agree_button, agree_state, upload_section])
    # handle_uploads must return its four values in the order of this outputs list.
    submit_button.click(
        fn=handle_uploads,
        inputs=[font_upload, wordlist_upload, agree_state],
        outputs=[agree_button, success_message, font_upload, wordlist_upload],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()