Upload FlowerVLA model trained on CALVIN_ABCD
Browse files- README.md +46 -0
- config.json +26 -0
- model.safetensors +3 -0
README.md
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# FlowerVLA - Vision-Language-Action Flow Model for CALVIN_ABCD
|
2 |
+
|
3 |
+
This is a pretrained FlowerVLA model for robotic manipulation, trained on the CALVIN_ABCD dataset. FlowerVLA is an efficient Vision-Language-Action Flow policy for robot learning.
|
4 |
+
|
5 |
+
## Model Description
|
6 |
+
|
7 |
+
FlowerVLA is a novel architecture that:
|
8 |
+
- Uses Florence-2 for multi-modal vision-language encoding
|
9 |
+
- Employs a transformer-based flow matching architecture
|
10 |
+
- Provides an efficient policy with ~1B parameters
|
11 |
+
- Operates on action chunks for better long-horizon planning
|
12 |
+
|
13 |
+
## Usage
|
14 |
+
|
15 |
+
```python
|
16 |
+
from huggingface_hub import snapshot_download
|
17 |
+
import torch
|
18 |
+
import hydra
|
19 |
+
from omegaconf import OmegaConf
|
20 |
+
import json
|
21 |
+
import os
|
22 |
+
|
23 |
+
model_path = snapshot_download(repo_id="{repo_id}")
|
24 |
+
|
25 |
+
with open(os.path.join(model_path, "config.json")) as f:
|
26 |
+
config = json.load(f)
|
27 |
+
|
28 |
+
model_cfg = OmegaConf.create(config["model_config"])
|
29 |
+
model_cfg["_target_"] = "flower.models.flower.FLOWERVLA"
|
30 |
+
|
31 |
+
model = hydra.utils.instantiate(model_cfg)
|
32 |
+
|
33 |
+
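from safetensors.torch import load_file  # weights are shipped as model.safetensors, not model.pt
state_dict = load_file(os.path.join(model_path, "model.safetensors"))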
|
34 |
+
model.load_state_dict(state_dict)
|
35 |
+
|
36 |
+
model.eval()
|
37 |
+
|
38 |
+
# obs = {...} # Your observation dict
|
39 |
+
# goal = {"lang_text": "push the blue block to the right"}
|
40 |
+
# action = model.step(obs, goal)
|
41 |
+
|
42 |
+
```

```bibtex
@inproceedings{
|
43 |
+
reuss2024multimodal,
|
44 |
+
# Add citation when available
|
45 |
+
}
```
|
46 |
+
|
config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_config": {
|
3 |
+
"vlm_path": "microsoft/Florence-2-large",
|
4 |
+
"freeze_florence": false,
|
5 |
+
"freeze_vision_tower": false,
|
6 |
+
"vlm_prompt_style": "default",
|
7 |
+
"token_dropout": 0.1,
|
8 |
+
"multistep": 10,
|
9 |
+
"num_sampling_steps": 4,
|
10 |
+
"lowdim_obs_dim": 7,
|
11 |
+
"action_dim": 7,
|
12 |
+
"act_window_size": 10,
|
13 |
+
"use_second_view": true,
|
14 |
+
"second_view_key": "image_wrist",
|
15 |
+
"action_type_adaln": true,
|
16 |
+
"use_causal_attention": true,
|
17 |
+
"use_cross_attn": true,
|
18 |
+
"sampling_type": "uniform",
|
19 |
+
"dit_dim": 1024,
|
20 |
+
"n_heads": 16,
|
21 |
+
"n_layers": 18,
|
22 |
+
"use_rope": true,
|
23 |
+
"query_seq_len": 100,
|
24 |
+
"rope_theta": 32.0
|
25 |
+
}
|
26 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60b3f38dee37662f43064b6acb0c6a864b52657ae96879fc31ab08228d1573ed
|
3 |
+
size 4000241052
|