Upload FlowerVLA model trained on CALVIN_ABCD

Files changed (3) hide show

README.md ADDED Viewed

+# FlowerVLA - Vision-Language-Action Flow Model for CALVIN ABCD
+    This is a pretrained FlowerVLA model for robotic manipulation trained on the CALVIN ABCD dataset. FlowerVLA is an efficient Vision-Language-Action Flow policy for robot learning.
+    ## Model Description
+    FlowerVLA is a novel architecture that:
+    - Uses Florence-2 for multi-modal vision-language encoding
+    - Employs a transformer-based flow matching architecture
+    - Provides an efficient policy with ~1B parameters
+    - Operates on action chunks for better long-horizon planning
+    ## Usage
+    ```python
+    from huggingface_hub import snapshot_download
+    import torch
+    import hydra
+    from omegaconf import OmegaConf
+    import json
+    import os
+    model_path = snapshot_download(repo_id="{repo_id}")
+    with open(os.path.join(model_path, "config.json")) as f:
+        config = json.load(f)
+    model_cfg = OmegaConf.create(config["model_config"])
+    model_cfg["_target_"] = "flower.models.flower.FLOWERVLA"
+    model = hydra.utils.instantiate(model_cfg)
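+    # Weights are shipped as model.safetensors in this repository
+    from safetensors.torch import load_file
+    state_dict = load_file(os.path.join(model_path, "model.safetensors"))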
+    model.load_state_dict(state_dict)
+    model.eval()
+    # obs = {...}  # Your observation dict
+    # goal = {"lang_text": "push the blue block to the right"}
+    # action = model.step(obs, goal)
+    ```
+    ## Citation
+    ```bibtex
+    @inproceedings{
+        reuss2024multimodal,
+        # Add citation when available
+    }
+    ```

config.json ADDED Viewed

+{
+  "model_config": {
+    "vlm_path": "microsoft/Florence-2-large",
+    "freeze_florence": false,
+    "freeze_vision_tower": false,
+    "vlm_prompt_style": "default",
+    "token_dropout": 0.1,
+    "multistep": 10,
+    "num_sampling_steps": 4,
+    "lowdim_obs_dim": 7,
+    "action_dim": 7,
+    "act_window_size": 10,
+    "use_second_view": true,
+    "second_view_key": "image_wrist",
+    "action_type_adaln": true,
+    "use_causal_attention": true,
+    "use_cross_attn": true,
+    "sampling_type": "uniform",
+    "dit_dim": 1024,
+    "n_heads": 16,
+    "n_layers": 18,
+    "use_rope": true,
+    "query_seq_len": 100,
+    "rope_theta": 32.0
+  }
+}

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:60b3f38dee37662f43064b6acb0c6a864b52657ae96879fc31ab08228d1573ed
+size 4000241052