mbreuss committed on
Commit
8424c4a
·
verified ·
1 Parent(s): ab086ab

Upload FlowerVLA model trained on CALVIN_ABCD

Browse files
Files changed (3) hide show
  1. README.md +46 -0
  2. config.json +26 -0
  3. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FlowerVLA - Vision-Language-Action Flow Model for CALVIN ABCD
2
+
3
+ This is a pretrained FlowerVLA model for robotic manipulation, trained on the CALVIN ABCD dataset. FlowerVLA is an efficient Vision-Language-Action Flow policy for robot learning.
4
+
5
+ ## Model Description
6
+
7
+ FlowerVLA is a novel architecture that:
8
+ - Uses Florence-2 for multi-modal vision-language encoding
9
+ - Employs a transformer-based flow matching architecture
10
+ - Provides an efficient policy with ~1B parameters
11
+ - Operates on action chunks for better long-horizon planning
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ from huggingface_hub import snapshot_download
17
+ import torch
18
+ import hydra
19
+ from omegaconf import OmegaConf
20
+ import json
21
+ import os
22
+
23
+ model_path = snapshot_download(repo_id="{repo_id}")  # TODO: replace {repo_id} with this model's Hub repository ID
24
+
25
+ with open(os.path.join(model_path, "config.json")) as f:
26
+     config = json.load(f)
27
+
28
+ model_cfg = OmegaConf.create(config["model_config"])
29
+ model_cfg["_target_"] = "flower.models.flower.FLOWERVLA"
30
+
31
+ model = hydra.utils.instantiate(model_cfg)
32
+ from safetensors.torch import load_file
33
+ state_dict = load_file(os.path.join(model_path, "model.safetensors"))
34
+ model.load_state_dict(state_dict)
35
+
36
+ model.eval()
37
+
38
+ # obs = {...} # Your observation dict
39
+ # goal = {"lang_text": "push the blue block to the right"}
40
+ # action = model.step(obs, goal)
41
+ ```
42
+ @inproceedings{
43
+ reuss2024multimodal,
44
+ # Add citation when available
45
+ }
46
+
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_config": {
3
+ "vlm_path": "microsoft/Florence-2-large",
4
+ "freeze_florence": false,
5
+ "freeze_vision_tower": false,
6
+ "vlm_prompt_style": "default",
7
+ "token_dropout": 0.1,
8
+ "multistep": 10,
9
+ "num_sampling_steps": 4,
10
+ "lowdim_obs_dim": 7,
11
+ "action_dim": 7,
12
+ "act_window_size": 10,
13
+ "use_second_view": true,
14
+ "second_view_key": "image_wrist",
15
+ "action_type_adaln": true,
16
+ "use_causal_attention": true,
17
+ "use_cross_attn": true,
18
+ "sampling_type": "uniform",
19
+ "dit_dim": 1024,
20
+ "n_heads": 16,
21
+ "n_layers": 18,
22
+ "use_rope": true,
23
+ "query_seq_len": 100,
24
+ "rope_theta": 32.0
25
+ }
26
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60b3f38dee37662f43064b6acb0c6a864b52657ae96879fc31ab08228d1573ed
3
+ size 4000241052