# Log in to Hugging Face (only needed once)

In [None]:
from huggingface_hub import interpreter_login
# Log this session in to the Hugging Face Hub. Later cells in this notebook
# call push_to_hub for both the dataset and the fine-tuned model.
interpreter_login()

# Collect Menu Image Datasets
- Use `metadata.jsonl` to label the images' ground truth. You can visit [here](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) to see the examples.
- After finishing, push to HuggingFace Datasets.
- For labeling:
    - [Google AI Studio](https://aistudio.google.com) or [OpenAI ChatGPT](https://chatgpt.com).
    - Use function calling via the API. Start the Gradio app locally or visit [here](https://huggingface.co/spaces/ryanlinjui/menu-text-detection).

### Menu Type
- **h**: horizontal menu
- **v**: vertical menu
- **d**: document-style menu
- **s**: in-scene menu (non-document style)
- **i**: irregular menu (menu with irregular text layout)

> Please see the [examples](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) for more details.

In [None]:
from datasets import load_dataset

# Build the dataset from the local directory that holds the image files
# together with their metadata.jsonl labels.
dataset = load_dataset(
    path="datasets/menu-zh-TW",
)

# Publish the loaded dataset to the Hugging Face dataset hub.
dataset.push_to_hub(
    repo_id="ryanlinjui/menu-zh-TW",
)

# Setup for Fine-tuning

In [None]:
from datasets import load_dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig

from menu.donut import DonutDatasets

DATASETS_REPO_ID = "ryanlinjui/menu-zh-TW"              # set your dataset repo id for training
PRETRAINED_MODEL_REPO_ID = "naver-clova-ix/donut-base"  # set your pretrained model repo id for fine-tuning
TASK_PROMPT_NAME = "<s_menu>"                           # set your task prompt name for training
MAX_LENGTH = 768                                        # set the maximum output (decoder) length
IMAGE_SIZE = [1280, 960]                                # set your image size for training, as [height, width]

raw_datasets = load_dataset(DATASETS_REPO_ID)

# Config: start from the pretrained config and override the encoder input
# size and the decoder's maximum generation length.
config = VisionEncoderDecoderConfig.from_pretrained(PRETRAINED_MODEL_REPO_ID)
config.encoder.image_size = IMAGE_SIZE
config.decoder.max_length = MAX_LENGTH

# Processor: converts images to tensors and text to token ids.
# `processor.feature_extractor` is a deprecated alias of
# `processor.image_processor`, so the image processor is configured directly.
# The size is given as an explicit height/width dict; this is the same size
# that the legacy reversed (width, height) list `IMAGE_SIZE[::-1]` described.
processor = DonutProcessor.from_pretrained(PRETRAINED_MODEL_REPO_ID)
processor.image_processor.size = {"height": IMAGE_SIZE[0], "width": IMAGE_SIZE[1]}
processor.image_processor.do_align_long_axis = False

# DonutDatasets: wraps the raw dataset for training.
# For model input, the image must be converted to a tensor and the JSON text
# must be converted to tokens together with the task prompt string.
# This example uses the column names "image" and "menu": the image file lives
# in the "image" column and the JSON annotation in the "menu" column.
datasets = DonutDatasets(
    datasets=raw_datasets,
    processor=processor,
    image_column="image",
    annotation_column="menu",
    task_start_token=TASK_PROMPT_NAME,
    prompt_end_token=TASK_PROMPT_NAME,
    train_split=0.8,
    validation_split=0.1,
    test_split=0.1,
    sort_json_key=True,
    seed=42
)

# Model: load the pretrained weights with the adjusted config.
model = VisionEncoderDecoderModel.from_pretrained(PRETRAINED_MODEL_REPO_ID, config=config)
# Keep the decoder embedding table in sync with the tokenizer's current size
# (presumably DonutDatasets registers extra tokens on the tokenizer; verify in menu.donut).
model.decoder.resize_token_embeddings(len(processor.tokenizer))
model.config.pad_token_id = processor.tokenizer.pad_token_id
# Generation starts from the task prompt token.
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids([TASK_PROMPT_NAME])[0]

# Start Fine-tuning

In [None]:
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

HUGGINGFACE_MODEL_ID = "ryanlinjui/donut-base-finetuned-menu" # hub repo id used for saving / pushing the model
EPOCHS = 100            # number of training epochs
TRAIN_BATCH_SIZE = 4    # per-device training batch size

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")
model.to(device)

training_args = Seq2SeqTrainingArguments(
    # Output locations and reporting
    output_dir="./.checkpoints",
    logging_dir="./.checkpoints/logs",
    report_to="tensorboard",
    # Optimisation schedule
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=1,
    learning_rate=3e-5,
    warmup_steps=30,
    seed=2022,
    # Step-based evaluation, logging and checkpointing
    eval_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    # Hub upload: enabled only when a model repo id is configured
    push_to_hub=bool(HUGGINGFACE_MODEL_ID),
    hub_model_id=HUGGINGFACE_MODEL_ID,
    hub_strategy="every_save",
)

# NOTE(review): periodic evaluation runs on the "test" split although a
# validation split is also requested when building `datasets` -- confirm this
# is intentional.
# NOTE(review): newer transformers releases may expect `processing_class`
# instead of `tokenizer` here -- confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=processor,
)

trainer.train()

In [None]:
from transformers import (
    VisionEncoderDecoderModel,
    DonutProcessor,
    pipeline
)
from PIL import Image

model_id = "ryanlinjui/donut-base-finetuned-menu"

# 1. Download and load the fine-tuned model and its processor.
processor = DonutProcessor.from_pretrained(model_id)
model     = VisionEncoderDecoderModel.from_pretrained(model_id)

# 2. Build an image-to-text pipeline around the loaded components.
#    The image processor is passed via `image_processor`; the
#    `processor.feature_extractor` alias used previously is deprecated.
ocr_pipeline = pipeline(
    "image-to-text",             # use the image-to-text task
    model=model,                 # pass in the already-loaded model
    tokenizer=processor.tokenizer,
    image_processor=processor.image_processor,
)

# 3. Load a test image.
image = Image.open("./examples/menu-hd.jpg")

# 4. Run the pipeline to get the result.
outputs = ocr_pipeline(image)

# 5. Print the generated text.
print(outputs[0]["generated_text"])

# Disabled alternative inference path, kept as a bare string literal so that
# none of it executes. It calls model.generate() directly with the "<s_menu>"
# task prompt as decoder input and converts the decoded sequence with
# processor.token2json().
# NOTE(review): the device selection inside the snippet falls back to "mps"
# without checking availability and has no "cpu" fallback -- fix before
# re-enabling.
# NOTE(review): as the last expression of a notebook cell, this string is
# presumably echoed as the cell output -- confirm.
'''
# test model
import re

from transformers import VisionEncoderDecoderModel
from transformers import DonutProcessor
import torch
from PIL import Image

image = Image.open("./examples/menu-hd.jpg").convert("RGB")

processor = DonutProcessor.from_pretrained("ryanlinjui/donut-base-finetuned-menu")
model = VisionEncoderDecoderModel.from_pretrained("ryanlinjui/donut-base-finetuned-menu")
device = "cuda" if torch.cuda.is_available() else "mps"

model.eval()
model.to(device)

pixel_values = processor(image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)

task_prompt = "<s_menu>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
# seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
seq = processor.token2json(seq)
print(seq)
'''


# Plot the results

In [None]:
# Training Loss
# Validation: normalized edit distance (Normal ED) per epoch, range 1~0: 1 -> 0.22
# Test: TED accuracy = 0.687058, F1 score = 0.51119