github-actions[bot] committed · Commit 8af6af2 · 0 Parent(s)
Sync from https://github.com/ryanlinjui/menu-text-detection

Files changed:
- .checkpoints/.gitkeep +0 -0
- .env.example +3 -0
- .github/workflows/sync.yml +25 -0
- .gitignore +24 -0
- .python-version +1 -0
- LICENSE +21 -0
- README.md +65 -0
- app.py +151 -0
- menu/donut.py +225 -0
- menu/llm/__init__.py +2 -0
- menu/llm/base.py +9 -0
- menu/llm/gemini.py +36 -0
- menu/llm/openai.py +39 -0
- pyproject.toml +23 -0
- requirements.txt +169 -0
- tools/schema_gemini.json +45 -0
- tools/schema_openai.json +48 -0
- train.ipynb +294 -0
- uv.lock +0 -0
.checkpoints/.gitkeep
ADDED
File without changes
.env.example
ADDED
@@ -0,0 +1,3 @@
+HUGGINGFACE_TOKEN="HUGGINGFACE_TOKEN"
+GIMINI_API_TOKEN="GIMINI_API_TOKEN"
+OPENAI_API_TOKEN="OPENAI_API_TOKEN"
.github/workflows/sync.yml
ADDED
@@ -0,0 +1,25 @@
+name: Sync to Hugging Face Spaces
+
+on:
+  push:
+    branches:
+      - main
+jobs:
+  sync:
+    name: Sync
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Remove bad files
+        run: rm -rf examples assets
+
+      - name: Sync to Hugging Face Spaces
+        uses: JacobLinCool/huggingface-sync@v1
+        with:
+          github: ${{ secrets.GITHUB_TOKEN }}
+          user: ryanlinjui # Hugging Face username or organization name
+          space: menu-text-detection # Hugging Face space name
+          token: ${{ secrets.HF_TOKEN }} # Hugging Face token
+          python_version: 3.11 # Python version
.gitignore
ADDED
@@ -0,0 +1,24 @@
+# mac
+.DS_Store
+
+# cache
+__pycache__
+
+# datasets
+datasets
+
+# papers
+docs/papers
+
+# uv
+.venv
+
+# gradio
+.gradio
+
+# env
+.env
+
+# checkpoint
+.checkpoints/*
+!.checkpoints/.gitkeep
.python-version
ADDED
@@ -0,0 +1 @@
+3.11
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 RyanLin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,65 @@
+---
+title: menu text detection
+emoji: 🦄
+colorFrom: indigo
+colorTo: pink
+sdk: gradio
+python_version: 3.11
+short_description: Extract structured menu information from images into JSON...
+tags: [ "donut","fine-tuning","image-to-text","transformer" ]
+---
+
+# Menu Text Detection System
+
+Extract structured menu information from images into JSON using a fine-tuned Donut E2E model.
+> Based on [Donut by Clova AI (ECCV ’22)](https://github.com/clovaai/donut)
+
+<div align="center">
+
+<img src="./assets/demo.gif" alt="demo" width="500"/><br>
+
+[](https://huggingface.co/spaces/ryanlinjui/menu-text-detection)<br>
+[](https://huggingface.co/collections/ryanlinjui/menu-text-detection-670ccf527626bb004bbfb39b)
+
+</div>
+
+## 🚀 Features
+### Overview
+Currently supports extracting the following information from menu images:
+
+- **Restaurant Name**
+- **Business Hours**
+- **Address**
+- **Phone Number**
+- **Dish Information**
+  - Name
+  - Price
+
+> For the JSON schema, see the [tools directory](./tools).
+
+### Supported Methods to Extract Menu Information
+- Fine-tuned Donut model
+- OpenAI GPT API
+- Google Gemini API
+
+## 💻 Training / Fine-Tuning
+### Setup
+Use [uv](https://github.com/astral-sh/uv) to set up the development environment:
+
+```bash
+uv sync
+```
+
+### Training Script (Dataset Collection, Fine-Tuning)
+Please refer to [`train.ipynb`](./train.ipynb). Use Jupyter Notebook for training:
+
+```bash
+uv run jupyter-notebook
+```
+
+> For VSCode users, please install the Jupyter extension, then select `.venv/bin/python` as your kernel.
+
+### Run Demo Locally
+```bash
+uv run python app.py
+```
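For orientation before reading `app.py` below: the demo's Donut path wraps a standard `transformers` image-to-text pipeline. A minimal standalone inference sketch, assuming the fine-tuned checkpoint `ryanlinjui/donut-base-finetuned-menu` referenced in `train.ipynb` and a hypothetical local image path:

```python
# Minimal sketch: run the fine-tuned Donut model outside the Gradio app.
# Assumes the ryanlinjui/donut-base-finetuned-menu checkpoint from train.ipynb;
# "menu.jpg" is a hypothetical local image path.
from PIL import Image
from transformers import pipeline

menu_pipeline = pipeline(task="image-to-text", model="ryanlinjui/donut-base-finetuned-menu")
outputs = menu_pipeline(Image.open("menu.jpg"))
print(outputs[0]["generated_text"])  # raw <s_...> token sequence; see menu/donut.py
```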
app.py
ADDED
@@ -0,0 +1,151 @@
+import os
+import json
+
+import numpy as np
+import gradio as gr
+from dotenv import load_dotenv
+
+from menu.llm import (
+    GeminiAPI,
+    OpenAIAPI
+)
+from menu.donut import DonutFinetuned
+
+load_dotenv()
+GEMINI_API_TOKEN = os.getenv("GIMINI_API_TOKEN", "")
+OPENAI_API_TOKEN = os.getenv("OPENAI_API_TOKEN", "")
+
+SOURCE_CODE_GH_URL = "https://github.com/ryanlinjui/menu-text-detection"
+BADGE_URL = "https://img.shields.io/badge/GitHub_Code-Click_Here!!-default?logo=github"
+
+GITHUB_RAW_URL = "https://raw.githubusercontent.com/ryanlinjui/menu-text-detection/main"
+EXAMPLE_IMAGE_LIST = [
+    f"{GITHUB_RAW_URL}/examples/menu-hd.jpg",
+    f"{GITHUB_RAW_URL}/examples/menu-vs.jpg",
+    f"{GITHUB_RAW_URL}/examples/menu-si.jpg"
+]
+MODEL_LIST = [
+    "Donut Model",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash-preview-04-17",
+    "gemini-2.5-pro-preview-03-25",
+    "gpt-4.1",
+    "gpt-4o",
+    "o4-mini"
+]
+
+def handle(image: np.ndarray, model: str, api_token: str) -> str:
+    if image is None:
+        raise gr.Error("Please upload an image first.")
+
+    if model == MODEL_LIST[0]:
+        result = DonutFinetuned.predict(image)
+
+    elif model in MODEL_LIST[1:]:
+        if len(api_token) < 10:
+            raise gr.Error(f"Please provide a valid token for {model}.")
+        try:
+            if model in MODEL_LIST[1:4]:
+                result = GeminiAPI.call(image, model, api_token)
+            else:
+                result = OpenAIAPI.call(image, model, api_token)
+        except Exception as e:
+            raise gr.Error(f"Failed to process with API model {model}: {str(e)}")
+    else:
+        raise gr.Error("Invalid model selection. Please choose a valid model.")
+
+    return json.dumps(result, indent=4, ensure_ascii=False)
+
+def UserInterface() -> gr.Interface:
+    with gr.Blocks(
+        delete_cache=(86400, 86400),
+        css="""
+        .image-panel {
+            display: flex;
+            flex-direction: column;
+            height: 600px;
+        }
+        .image-panel img {
+            object-fit: contain;
+            max-height: 600px;
+            max-width: 600px;
+            width: 100%;
+        }
+        .large-text textarea {
+            font-size: 20px !important;
+            height: 600px !important;
+            width: 100% !important;
+        }
+        """
+    ) as gradio_interface:
+        gr.HTML(f'<a href="{SOURCE_CODE_GH_URL}"><img src="{BADGE_URL}" alt="GitHub Code"/></a>')
+        gr.Markdown("# Menu Text Detection")
+
+        with gr.Row():
+            with gr.Column(scale=1, min_width=500):
+                gr.Markdown("## 📷 Menu Image")
+                menu_image = gr.Image(
+                    type="numpy",
+                    label="Input menu image",
+                    elem_classes="image-panel"
+                )
+
+                gr.Markdown("## 🤖 Model Selection")
+                model_choice_dropdown = gr.Dropdown(
+                    choices=MODEL_LIST,
+                    value=MODEL_LIST[0],
+                    label="Select Text Detection Model"
+                )
+
+                api_token_textbox = gr.Textbox(
+                    label="API Token",
+                    placeholder="Enter your API token here...",
+                    type="password",
+                    visible=False
+                )
+
+                generate_button = gr.Button("Generate Menu Information", variant="primary")
+
+                gr.Examples(
+                    examples=EXAMPLE_IMAGE_LIST,
+                    inputs=menu_image,
+                    label="Example Menu Images"
+                )
+
+            with gr.Column(scale=1):
+                gr.Markdown("## 🍽️ Menu Info")
+                menu_json_textbox = gr.Textbox(
+                    label="Output JSON",
+                    interactive=False,
+                    text_align="left",
+                    elem_classes="large-text"
+                )
+
+        def update_token_visibility(choice):
+            if choice in MODEL_LIST[1:]:
+                current_token = ""
+                if choice in MODEL_LIST[1:4]:
+                    current_token = GEMINI_API_TOKEN
+                elif choice in MODEL_LIST[4:]:
+                    current_token = OPENAI_API_TOKEN
+                return gr.update(visible=True, value=current_token)
+            else:
+                return gr.update(visible=False)
+
+        model_choice_dropdown.change(
+            fn=update_token_visibility,
+            inputs=model_choice_dropdown,
+            outputs=api_token_textbox
+        )
+
+        generate_button.click(
+            fn=handle,
+            inputs=[menu_image, model_choice_dropdown, api_token_textbox],
+            outputs=menu_json_textbox
+        )
+
+    return gradio_interface
+
+if __name__ == "__main__":
+    demo = UserInterface()
+    demo.launch()
menu/donut.py
ADDED
@@ -0,0 +1,225 @@
+import json
+from typing import Any, Dict, Optional
+
+import numpy as np
+from PIL import Image
+from datasets import DatasetDict
+from torch.utils.data import Dataset
+from transformers import pipeline, DonutProcessor
+
+class DonutFinetuned:
+    DEFAULT_PIPELINE = pipeline(
+        task="image-to-text",
+        model="naver-clova-ix/donut-base"
+    )
+    @classmethod
+    def predict(cls, image: np.ndarray) -> dict:
+        image = Image.fromarray(image)
+        result = cls.DEFAULT_PIPELINE(image)
+        return result
+
+class DonutDatasets:
+    """
+    Modified from:
+    https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Donut/CORD/Fine_tune_Donut_on_a_custom_dataset_(CORD)_with_PyTorch_Lightning.ipynb
+
+    Donut PyTorch Dataset Wrapper (supports train/validation/test splits)
+    - Dynamic field names and JSON-to-token conversion
+    - Returns PyTorch Datasets with __getitem__ producing tensors
+    - Splits controlled by train_split/validation_split/test_split
+    - Only single JSON annotation supported
+    - Supports subscripting: datasets["train"], datasets["validation"], datasets["test"]
+    Args:
+        - datasets: DatasetDict containing train/validation/test splits
+        - processor: DonutProcessor for image processing
+        - image_column: Column name for images in the dataset
+        - annotation_column: Column name for annotations in the dataset
+        - task_start_token: Token to start the task
+        - prompt_end_token: Token to end the prompt
+        - max_length: Maximum length of tokenized sequences
+        - train_split: Fraction of data to use for training (0.0-1.0)
+        - validation_split: Fraction of data to use for validation (0.0-1.0)
+        - test_split: Fraction of data to use for testing (0.0-1.0)
+        - ignore_index: Index to ignore in labels (default: -100)
+        - sort_json_key: Whether to sort JSON keys (default: True)
+        - seed: Random seed for reproducibility. If None, use OS random seed (default: None)
+        - shuffle: Whether to shuffle the dataset (default: True)
+    Returns:
+        - DonutDatasets object with train/validation/test splits
+    Example:
+        datasets = DonutDatasets(
+            datasets=dataset_dict,
+            processor=processor,
+            image_column="image",
+            annotation_column="annotation",
+            task_start_token="<s_task>",
+            prompt_end_token="<s_prompt>",
+            max_length=512,
+            train_split=0.8,
+            validation_split=0.1,
+            test_split=0.1
+        )
+        train_dataset = datasets["train"]
+        validation_dataset = datasets["validation"]
+        test_dataset = datasets["test"]
+    Note:
+        - The dataset must be a DatasetDict with train/validation/test splits
+        - The processor must be a DonutProcessor instance
+        - The image_column and annotation_column must exist in the dataset
+        - The task_start_token and prompt_end_token must be unique tokens
+        - The max_length should be set according to the model's maximum input length
+        - The ignore_index is used for padding in labels (default: -100)
+        - The sort_json_key option determines whether JSON keys are sorted or not
+    """
+    def __init__(
+        self,
+        datasets: DatasetDict,
+        processor: DonutProcessor,
+        image_column: str,
+        annotation_column: str,
+        task_start_token: str,
+        prompt_end_token: str,
+        max_length: int = 512,
+        train_split: float = 1.0,
+        validation_split: float = 0.0,
+        test_split: float = 0.0,
+        ignore_index: int = -100,
+        sort_json_key: bool = True,
+        seed: Optional[int] = None,
+        shuffle: bool = True
+    ):
+        assert abs(train_split + validation_split + test_split - 1.0) < 1e-6, (
+            "train/validation/test splits must sum to 1"
+        )
+        self.processor = processor
+        self.tokenizer = processor.tokenizer
+        self.image_column = image_column
+        self.annotation_column = annotation_column
+        self.max_length = max_length
+        self.task_start_token = task_start_token
+        self.prompt_end_token = prompt_end_token or task_start_token
+        self.ignore_index = ignore_index
+        self.sort_json_key = sort_json_key
+
+        # Perform split on provided datasets
+        raw = datasets
+        parts: Dict[str, Any] = {}
+        if train_split < 1.0:
+            split1 = raw["train"].train_test_split(test_size=1 - train_split, seed=seed, shuffle=shuffle)
+            parts["train"] = split1["train"]
+            rest = split1["test"]
+            if validation_split > 0:
+                val_frac = validation_split / (validation_split + test_split)
+                split2 = rest.train_test_split(test_size=1 - val_frac, seed=seed, shuffle=shuffle)
+                parts["validation"] = split2["train"]
+                parts["test"] = split2["test"]
+            else:
+                parts["test"] = rest
+        else:
+            parts = dict(raw)
+
+        # Create individual split datasets
+        self._splits: Dict[str, Dataset] = {}
+        for name, ds in parts.items():
+            self._splits[name] = _SplitDataset(
+                hf_dataset=ds,
+                processor=self.processor,
+                image_column=self.image_column,
+                annotation_column=self.annotation_column,
+                max_length=self.max_length,
+                ignore_index=self.ignore_index,
+                sort_json_key=self.sort_json_key,
+                task_start_token=self.task_start_token,
+                prompt_end_token=self.prompt_end_token,
+            )
+
+    def __getitem__(self, split: str) -> Dataset:
+        """
+        Return the dataset split by name, e.g., datasets["train"]
+        """
+        if split in self._splits:
+            return self._splits[split]
+        raise KeyError(f"Unknown split '{split}'. Available splits: {list(self._splits.keys())}")
+
+    def __repr__(self):
+        return f"DonutDatasets(splits={list(self._splits.keys())})"
+
+
+class _SplitDataset(Dataset):
+    """
+    PyTorch Dataset for a single split, returns (pixel_values, labels, target_sequence)
+    """
+    def __init__(
+        self,
+        hf_dataset,
+        processor: DonutProcessor,
+        image_column: str,
+        annotation_column: str,
+        max_length: int,
+        ignore_index: int,
+        sort_json_key: bool,
+        task_start_token: str,
+        prompt_end_token: str,
+    ):
+        self.processor = processor
+        self.tokenizer = processor.tokenizer
+        self.hf_dataset = hf_dataset
+        self.image_column = image_column
+        self.annotation_column = annotation_column
+        self.max_length = max_length
+        self.ignore_index = ignore_index
+        self.sort_json_key = sort_json_key
+        self.task_start_token = task_start_token
+        self.prompt_end_token = prompt_end_token
+
+        # Prepare tokenized ground-truth sequences (single annotation)
+        self.gt_token_sequences = []
+        for sample in self.hf_dataset:
+            gt = sample[self.annotation_column]
+            if isinstance(gt, str):
+                gt = json.loads(gt)
+            seq = self._json_to_token(gt) + self.tokenizer.eos_token
+            self.gt_token_sequences.append(seq)
+
+        # Add special tokens to tokenizer
+        self.tokenizer.add_tokens([self.task_start_token, self.prompt_end_token])
+
+    def _json_to_token(self, obj: Any) -> str:
+        if isinstance(obj, dict):
+            keys = sorted(obj.keys()) if self.sort_json_key else obj.keys()
+            seq = ""
+            for k in keys:
+                open_tag = f"<s_{k}>"
+                close_tag = f"</s_{k}>"
+                self.tokenizer.add_special_tokens({"additional_special_tokens": [open_tag, close_tag]})
+                seq += open_tag + self._json_to_token(obj[k]) + close_tag
+            return seq
+        if isinstance(obj, list):
+            return r"<sep/>".join(self._json_to_token(x) for x in obj)
+        return str(obj)
+
+    def __len__(self):
+        return len(self.hf_dataset)
+
+    def __getitem__(self, idx: int):
+        sample = self.hf_dataset[idx]
+        pixel_values = self.processor(sample[self.image_column], return_tensors="pt").pixel_values.squeeze()
+        target_seq = self.gt_token_sequences[idx]
+        tokens = self.tokenizer(
+            target_seq,
+            add_special_tokens=False,
+            max_length=self.max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        input_ids = tokens.input_ids.squeeze(0)
+        labels = input_ids.clone()
+        labels[labels == self.tokenizer.pad_token_id] = self.ignore_index
+        return {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": tokens.attention_mask.squeeze(0),
+            "labels": labels,
+            "target_sequence": target_seq
+        }
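The JSON-to-token scheme in `_SplitDataset._json_to_token` above is easiest to see on a concrete input. A standalone sketch (special-token registration omitted, `sort_json_key=True` behavior) showing the target sequence the model is trained to emit:

```python
# Standalone sketch of _SplitDataset._json_to_token (tokenizer registration
# omitted) to show the flattened target sequence format.
def json_to_token(obj) -> str:
    if isinstance(obj, dict):
        # sorted keys, each wrapped in <s_key>...</s_key>
        return "".join(f"<s_{k}>{json_to_token(obj[k])}</s_{k}>" for k in sorted(obj))
    if isinstance(obj, list):
        # list items joined with the <sep/> separator token
        return "<sep/>".join(json_to_token(x) for x in obj)
    return str(obj)

print(json_to_token({"restaurant": "Cafe", "dishes": [{"name": "Tea", "price": 30}]}))
# <s_dishes><s_name>Tea</s_name><s_price>30</s_price></s_dishes><s_restaurant>Cafe</s_restaurant>
```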
menu/llm/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .gemini import GeminiAPI
+from .openai import OpenAIAPI
menu/llm/base.py
ADDED
@@ -0,0 +1,9 @@
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+class LLMBase(ABC):
+    @classmethod
+    @abstractmethod
+    def call(cls, image: np.ndarray, model: str, token: str) -> dict:
+        raise NotImplementedError
menu/llm/gemini.py
ADDED
@@ -0,0 +1,36 @@
+import json
+
+import numpy as np
+from PIL import Image
+from google import genai
+from google.genai import types
+
+from .base import LLMBase
+
+FUNCTION_CALL = json.load(open("tools/schema_gemini.json", "r"))
+
+class GeminiAPI(LLMBase):
+    @classmethod
+    def call(cls, image: np.ndarray, model: str, token: str) -> dict:
+        client = genai.Client(api_key=token)  # Initialize the client with the API key
+        encode_img = Image.fromarray(image)  # Convert the image for the API
+
+        config = types.GenerateContentConfig(
+            tools=[types.Tool(function_declarations=[FUNCTION_CALL])],
+            tool_config={
+                "function_calling_config": {
+                    "mode": "ANY",
+                    "allowed_function_names": [FUNCTION_CALL["name"]]
+                }
+            }
+        )
+        response = client.models.generate_content(
+            model=model,
+            contents=[encode_img],
+            config=config
+        )
+        if response.candidates[0].content.parts[0].function_call:
+            function_call = response.candidates[0].content.parts[0].function_call
+            return function_call.args
+
+        return {}
menu/llm/openai.py
ADDED
@@ -0,0 +1,39 @@
+import json
+import base64
+from io import BytesIO
+
+import numpy as np
+from PIL import Image
+from openai import OpenAI
+
+from .base import LLMBase
+
+FUNCTION_CALL = json.load(open("tools/schema_openai.json", "r"))
+
+class OpenAIAPI(LLMBase):
+    @classmethod
+    def call(cls, image: np.ndarray, model: str, token: str) -> dict:
+        client = OpenAI(api_key=token)  # Initialize the client with the API key
+        buffer = BytesIO()
+        Image.fromarray(image).save(buffer, format="JPEG")
+        encode_img = base64.b64encode(buffer.getvalue()).decode("utf-8")  # Convert the image for the API
+
+        response = client.responses.create(
+            model=model,
+            input=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/jpeg;base64,{encode_img}",
+                        },
+                    ],
+                }
+            ],
+            tools=[FUNCTION_CALL],
+        )
+        if response and response.output:
+            if hasattr(response.output[0], "arguments"):
+                return json.loads(response.output[0].arguments)
+        return {}
pyproject.toml
ADDED
@@ -0,0 +1,23 @@
+[project]
+authors = [{name = "ryanlinjui", email = "[email protected]"}]
+name = "menu-text-detection"
+version = "0.1.0"
+description = "Extract structured menu information from images into JSON using a fine-tuned Donut E2E model."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "accelerate>=1.6.0",
+    "datasets>=3.6.0",
+    "dotenv>=0.9.9",
+    "google-genai>=1.14.0",
+    "gradio>=5.29.0",
+    "huggingface-hub>=0.31.1",
+    "matplotlib>=3.10.1",
+    "notebook>=7.4.2",
+    "openai>=1.77.0",
+    "pillow>=11.2.1",
+    "protobuf>=6.30.2",
+    "sentencepiece>=0.2.0",
+    "tensorboardx>=2.6.2.2",
+    "transformers>=4.51.3",
+]
requirements.txt
ADDED
@@ -0,0 +1,169 @@
+accelerate==1.6.0
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+appnope==0.1.4
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.5
+attrs==25.3.0
+babel==2.17.0
+beautifulsoup4==4.13.4
+bleach==6.2.0
+cachetools==5.5.2
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.1.8
+comm==0.2.2
+contourpy==1.3.2
+cycler==0.12.1
+datasets==3.6.0
+debugpy==1.8.14
+decorator==5.2.1
+defusedxml==0.7.1
+dill==0.3.8
+distro==1.9.0
+dotenv==0.9.9
+executing==2.2.0
+fastapi==0.115.12
+fastjsonschema==2.21.1
+ffmpy==0.5.0
+filelock==3.18.0
+fonttools==4.57.0
+fqdn==1.5.1
+frozenlist==1.6.0
+fsspec==2025.3.0
+google-auth==2.40.1
+google-genai==1.14.0
+gradio==5.29.0
+gradio-client==1.10.0
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.31.1
+idna==3.10
+ipykernel==6.29.5
+ipython==9.2.0
+ipython-pygments-lexers==1.1.1
+isoduration==20.11.0
+jedi==0.19.2
+jinja2==3.1.6
+jiter==0.9.0
+json5==0.12.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+jupyter-client==8.6.3
+jupyter-core==5.7.2
+jupyter-events==0.12.0
+jupyter-lsp==2.2.5
+jupyter-server==2.15.0
+jupyter-server-terminals==0.5.3
+jupyterlab==4.4.2
+jupyterlab-pygments==0.3.0
+jupyterlab-server==2.27.3
+kiwisolver==1.4.8
+markdown-it-py==3.0.0
+markupsafe==3.0.2
+matplotlib==3.10.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.1.3
+mpmath==1.3.0
+multidict==6.4.3
+multiprocess==0.70.16
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.2
+notebook==7.4.2
+notebook-shim==0.2.4
+numpy==2.2.5
+openai==1.77.0
+orjson==3.10.18
+overrides==7.7.0
+packaging==25.0
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.2.1
+platformdirs==4.3.8
+prometheus-client==0.21.1
+prompt-toolkit==3.0.51
+propcache==0.3.1
+protobuf==6.30.2
+psutil==7.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.3
+pyarrow==20.0.0
+pyasn1==0.6.1
+pyasn1-modules==0.4.2
+pycparser==2.22
+pydantic==2.11.4
+pydantic-core==2.33.2
+pydub==0.25.1
+pygments==2.19.1
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-json-logger==3.3.0
+python-multipart==0.0.20
+pytz==2025.2
+pyyaml==6.0.2
+pyzmq==26.4.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==14.0.0
+rpds-py==0.24.0
+rsa==4.9.1
+ruff==0.11.8
+safehttpx==0.1.6
+safetensors==0.5.3
+semantic-version==2.10.0
+send2trash==1.8.3
+sentencepiece==0.2.0
+setuptools==80.3.1
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.14.0
+terminado==0.18.1
+tinycss2==1.4.0
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.7.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.51.3
+typer==0.15.3
+types-python-dateutil==2.9.0.20241206
+typing-extensions==4.13.2
+typing-inspection==0.4.0
+tzdata==2025.2
+uri-template==1.3.0
+urllib3==2.4.0
+uvicorn==0.34.2
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==15.0.1
+xxhash==3.5.0
+yarl==1.20.0
tools/schema_gemini.json
ADDED
@@ -0,0 +1,45 @@
+{
+    "name": "extract_menu_data",
+    "description": "Extract structured menu information from images.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "restaurant": {
+                "type": "string",
+                "description": "Name of the restaurant. If the name is not available, it should be ''."
+            },
+            "address": {
+                "type": "string",
+                "description": "Address of the restaurant. If the address is not available, it should be ''."
+            },
+            "phone": {
+                "type": "string",
+                "description": "Phone number of the restaurant. If the phone number is not available, it should be ''."
+            },
+            "business_hours": {
+                "type": "string",
+                "description": "Business hours of the restaurant. If the business hours are not available, it should be ''."
+            },
+            "dishes": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "name": {
+                            "type": "string",
+                            "description": "Name of the menu item."
+                        },
+                        "price": {
+                            "type": "number",
+                            "format": "float",
+                            "description": "Price of the menu item. If the price is not available, it should be -1."
+                        }
+                    },
+                    "required": ["name", "price"]
+                },
+                "description": "List of menu dish items."
+            }
+        },
+        "required": ["restaurant", "address", "phone", "business_hours", "dishes"]
+    }
+}
tools/schema_openai.json
ADDED
@@ -0,0 +1,48 @@
+{
+    "type": "function",
+    "name": "extract_menu_data",
+    "description": "Extract structured menu information from images.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "restaurant": {
+                "type": "string",
+                "description": "Name of the restaurant. If the name is not available, it should be ''."
+            },
+            "address": {
+                "type": "string",
+                "description": "Address of the restaurant. If the address is not available, it should be ''."
+            },
+            "phone": {
+                "type": "string",
+                "description": "Phone number of the restaurant. If the phone number is not available, it should be ''."
+            },
+            "business_hours": {
+                "type": "string",
+                "description": "Business hours of the restaurant. If the business hours are not available, it should be ''."
+            },
+            "dishes": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "name": {
+                            "type": "string",
+                            "description": "Name of the menu item."
+                        },
+                        "price": {
+                            "type": "number",
+                            "format": "float",
+                            "description": "Price of the menu item. If the price is not available, it should be -1."
+                        }
+                    },
+                    "required": ["name", "price"],
+                    "additionalProperties": false
+                },
+                "description": "List of menu dish items."
+            }
+        },
+        "required": ["restaurant", "address", "phone", "business_hours", "dishes"],
+        "additionalProperties": false
+    }
+}
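Both schema files describe the same payload shape; only the wrapper differs (Gemini takes a bare function declaration, while the OpenAI Responses API takes a `type: function` tool with `additionalProperties: false`). For illustration, a conforming extraction result as returned by `GeminiAPI.call` / `OpenAIAPI.call` (all values hypothetical):

```python
# Hypothetical extraction result conforming to tools/schema_*.json
result = {
    "restaurant": "Example Cafe",     # "" when the name is not on the menu
    "address": "",                    # "" when not available
    "phone": "02-1234-5678",
    "business_hours": "11:00-21:00",
    "dishes": [
        {"name": "Beef Noodles", "price": 160},
        {"name": "Iced Tea", "price": -1},  # -1 when the price is missing
    ],
}
```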
train.ipynb
ADDED
@@ -0,0 +1,294 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Login to HuggingFace (just login once)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import interpreter_login\n",
+    "interpreter_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Collect Menu Image Datasets\n",
+    "- Use `metadata.jsonl` to label the images' ground truth. You can visit [here](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) to see the examples.\n",
+    "- After finishing, push to HuggingFace Datasets.\n",
+    "- For labeling:\n",
+    "  - [Google AI Studio](https://aistudio.google.com) or [OpenAI ChatGPT](https://chatgpt.com).\n",
+    "  - Use function calling via the API. Start the Gradio app locally or visit [here](https://huggingface.co/spaces/ryanlinjui/menu-text-detection).\n",
+    "\n",
+    "### Menu Type\n",
+    "- **h**: horizontal menu\n",
+    "- **v**: vertical menu\n",
+    "- **d**: document-style menu\n",
+    "- **s**: in-scene menu (non-document style)\n",
+    "- **i**: irregular menu (menu with irregular text layout)\n",
+    "\n",
+    "> Please see the [examples](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) for more details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset(path=\"datasets/menu-zh-TW\") # load the dataset from the local directory containing the metadata.jsonl and image files\n",
+    "dataset.push_to_hub(repo_id=\"ryanlinjui/menu-zh-TW\") # push to the huggingface dataset hub"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Setup for Fine-tuning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig\n",
+    "\n",
+    "from menu.donut import DonutDatasets\n",
+    "\n",
+    "DATASETS_REPO_ID = \"ryanlinjui/menu-zh-TW\" # set your dataset repo id for training\n",
+    "PRETRAINED_MODEL_REPO_ID = \"naver-clova-ix/donut-base\" # set your pretrained model repo id for fine-tuning\n",
+    "TASK_PROMPT_NAME = \"<s_menu>\" # set your task prompt name for training\n",
+    "MAX_LENGTH = 768 # set your max length for maximum output length\n",
+    "IMAGE_SIZE = [1280, 960] # set your image size for training\n",
+    "\n",
+    "raw_datasets = load_dataset(DATASETS_REPO_ID)\n",
+    "\n",
+    "# Config: set the model config\n",
+    "config = VisionEncoderDecoderConfig.from_pretrained(PRETRAINED_MODEL_REPO_ID)\n",
+    "config.encoder.image_size = IMAGE_SIZE\n",
+    "config.decoder.max_length = MAX_LENGTH\n",
+    "\n",
+    "# Processor: use the processor to process the dataset.\n",
+    "# Convert the image to the tensor and the text to the token ids.\n",
+    "processor = DonutProcessor.from_pretrained(PRETRAINED_MODEL_REPO_ID)\n",
+    "processor.feature_extractor.size = IMAGE_SIZE[::-1]\n",
+    "processor.feature_extractor.do_align_long_axis = False\n",
+    "\n",
+    "# DonutDatasets: use DonutDatasets to process the dataset.\n",
+    "# For model input, the image must be converted to a tensor and the JSON text must be converted to tokens with the task prompt string.\n",
+    "# This example uses the column names \"image\" and \"menu\", so the image file is in the \"image\" column and the JSON text is in the \"menu\" column.\n",
+    "datasets = DonutDatasets(\n",
+    "    datasets=raw_datasets,\n",
+    "    processor=processor,\n",
+    "    image_column=\"image\",\n",
+    "    annotation_column=\"menu\",\n",
+    "    task_start_token=TASK_PROMPT_NAME,\n",
+    "    prompt_end_token=TASK_PROMPT_NAME,\n",
+    "    train_split=0.8,\n",
+    "    validation_split=0.1,\n",
+    "    test_split=0.1,\n",
+    "    sort_json_key=True,\n",
+    "    seed=42\n",
+    ")\n",
+    "\n",
+    "# Model: load the pretrained model and set the config.\n",
+    "model = VisionEncoderDecoderModel.from_pretrained(PRETRAINED_MODEL_REPO_ID, config=config)\n",
+    "model.decoder.resize_token_embeddings(len(processor.tokenizer))\n",
+    "model.config.pad_token_id = processor.tokenizer.pad_token_id\n",
+    "model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids([TASK_PROMPT_NAME])[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Start Fine-tuning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
+    "\n",
+    "HUGGINGFACE_MODEL_ID = \"ryanlinjui/donut-base-finetuned-menu\" # set your huggingface model repo id for saving / pushing to the hub\n",
+    "EPOCHS = 100 # set your training epochs\n",
+    "TRAIN_BATCH_SIZE = 4 # set your training batch size\n",
+    "\n",
+    "device = (\n",
+    "    \"cuda\"\n",
+    "    if torch.cuda.is_available()\n",
+    "    else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
+    ")\n",
+    "print(f\"Using {device} device\")\n",
+    "model.to(device)\n",
+    "\n",
+    "training_args = Seq2SeqTrainingArguments(\n",
+    "    num_train_epochs=EPOCHS,\n",
+    "    per_device_train_batch_size=TRAIN_BATCH_SIZE,\n",
+    "    learning_rate=3e-5,\n",
+    "    per_device_eval_batch_size=1,\n",
+    "    output_dir=\"./.checkpoints\",\n",
+    "    seed=2022,\n",
+    "    warmup_steps=30,\n",
+    "    eval_strategy=\"steps\",\n",
+    "    eval_steps=100,\n",
+    "    logging_strategy=\"steps\",\n",
+    "    logging_steps=50,\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=200,\n",
+    "    push_to_hub=True if HUGGINGFACE_MODEL_ID else False,\n",
+    "    hub_model_id=HUGGINGFACE_MODEL_ID,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    report_to=\"tensorboard\",\n",
+    "    logging_dir=\"./.checkpoints/logs\",\n",
+    ")\n",
+    "trainer = Seq2SeqTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=datasets[\"train\"],\n",
+    "    eval_dataset=datasets[\"test\"],\n",
+    "    tokenizer=processor\n",
+    ")\n",
+    "\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import (\n",
+    "    VisionEncoderDecoderModel,\n",
+    "    DonutProcessor,\n",
+    "    pipeline\n",
+    ")\n",
+    "from PIL import Image\n",
+    "\n",
+    "model_id = \"ryanlinjui/donut-base-finetuned-menu\"\n",
+    "\n",
+    "# 1. Download and load the model + processor\n",
+    "processor = DonutProcessor.from_pretrained(model_id)\n",
+    "model = VisionEncoderDecoderModel.from_pretrained(model_id)\n",
+    "\n",
+    "# 2. Build an image-to-text pipeline\n",
+    "ocr_pipeline = pipeline(\n",
+    "    \"image-to-text\", # use the image-to-text task\n",
+    "    model=model, # pass in the loaded model\n",
+    "    tokenizer=processor.tokenizer,\n",
+    "    feature_extractor=processor.feature_extractor,\n",
+    ")\n",
+    "\n",
+    "# 3. Load a test image\n",
+    "image = Image.open(\"./examples/menu-hd.jpg\")\n",
+    "\n",
+    "# 4. Call the pipeline and get the result\n",
+    "outputs = ocr_pipeline(image)\n",
+    "\n",
+    "# 5. Print the recognized text\n",
+    "print(outputs[0][\"generated_text\"])\n",
+    "\n",
+    "'''\n",
+    "# test model\n",
+    "import re\n",
+    "\n",
+    "from transformers import VisionEncoderDecoderModel\n",
+    "from transformers import DonutProcessor\n",
+    "import torch\n",
+    "from PIL import Image\n",
+    "\n",
+    "image = Image.open(\"./examples/menu-hd.jpg\").convert(\"RGB\")\n",
+    "\n",
+    "processor = DonutProcessor.from_pretrained(\"ryanlinjui/donut-base-finetuned-menu\")\n",
+    "model = VisionEncoderDecoderModel.from_pretrained(\"ryanlinjui/donut-base-finetuned-menu\")\n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"mps\"\n",
+    "\n",
+    "model.eval()\n",
+    "model.to(device)\n",
+    "\n",
+    "pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
+    "pixel_values = pixel_values.to(device)\n",
+    "\n",
+    "task_prompt = \"<s_menu>\"\n",
+    "decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors=\"pt\").input_ids\n",
+    "decoder_input_ids = decoder_input_ids.to(device)\n",
+    "outputs = model.generate(\n",
+    "    pixel_values,\n",
+    "    decoder_input_ids=decoder_input_ids,\n",
+    "    max_length=model.decoder.config.max_position_embeddings,\n",
+    "    early_stopping=True,\n",
+    "    pad_token_id=processor.tokenizer.pad_token_id,\n",
+    "    eos_token_id=processor.tokenizer.eos_token_id,\n",
+    "    use_cache=True,\n",
+    "    num_beams=1,\n",
+    "    bad_words_ids=[[processor.tokenizer.unk_token_id]],\n",
+    "    return_dict_in_generate=True,\n",
+    ")\n",
+    "\n",
+    "seq = processor.batch_decode(outputs.sequences)[0]\n",
+    "seq = seq.replace(processor.tokenizer.eos_token, \"\").replace(processor.tokenizer.pad_token, \"\")\n",
+    "# seq = re.sub(r\"<.*?>\", \"\", seq, count=1).strip() # remove first task start token\n",
+    "seq = processor.token2json(seq)\n",
+    "print(seq)\n",
+    "'''\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plot the results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Training Loss\n",
+    "# Validation: normalized edit distance per epoch (from 1 down to 0.22)\n",
+    "# Test: TED accuracy 0.687058, F1 score 0.51119"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
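The final notebook cell records validation normalized edit distance falling from 1 to 0.22, but the notebook itself does not ship that helper. A minimal sketch of the metric under the usual reading (character-level Levenshtein distance normalized by the longer string, where 0 means an exact match):

```python
# Minimal sketch of normalized edit distance (not shipped in this repo):
# Levenshtein distance over characters divided by the longer string's length.
def normalized_edit_distance(pred: str, truth: str) -> float:
    m, n = len(pred), len(truth)
    if max(m, n) == 0:
        return 0.0
    prev = list(range(n + 1))  # DP row for the empty-prefix baseline
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == truth[j - 1] else 1
            curr[j] = min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost)
        prev = curr
    return prev[n] / max(m, n)

print(normalized_edit_distance("<s_menu>tea", "<s_menu>teas"))  # ~0.083
```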
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff