From b5f53d73721f05c4b0057403dc1744fb3d9a6046 Mon Sep 17 00:00:00 2001 From: ruiheng123 Date: Thu, 9 Oct 2025 09:46:29 +0800 Subject: [PATCH 1/6] New 3dpc branch --- .gitgnore => .gitignore | 7 +- LIBERO | 1 + calvin | 1 + eval.sh | 9 + eval2.sh | 6 + .../configs/modeling_prismatic.py | 64 +- prismatic/extern/hf/modeling_prismatic.py | 66 +- prismatic/models/action_heads.py | 19 +- .../llm/__pycache__/__init__.cpython-310.pyc | Bin 348 -> 353 bytes .../llm/__pycache__/base_llm.cpython-310.pyc | Bin 7582 -> 7587 bytes .../llm/__pycache__/llama2.cpython-310.pyc | Bin 3148 -> 3153 bytes .../llm/__pycache__/mistral.cpython-310.pyc | Bin 2486 -> 2491 bytes .../llm/__pycache__/phi.cpython-310.pyc | Bin 2260 -> 2265 bytes .../llm/__pycache__/qwen25.cpython-310.pyc | Bin 3039 -> 3044 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 577 -> 582 bytes .../__pycache__/base_prompter.cpython-310.pyc | Bin 2900 -> 2905 bytes .../llama2_chat_prompter.cpython-310.pyc | Bin 2911 -> 2916 bytes .../mistral_instruct_prompter.cpython-310.pyc | Bin 2348 -> 2353 bytes .../__pycache__/phi_prompter.cpython-310.pyc | Bin 2298 -> 2303 bytes .../__pycache__/qwen_prompter.cpython-310.pyc | Bin 2562 -> 2567 bytes .../vicuna_v15_prompter.cpython-310.pyc | Bin 2617 -> 2622 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 558 -> 563 bytes .../__pycache__/base_vision.cpython-310.pyc | Bin 10249 -> 10254 bytes .../__pycache__/clip_vit.cpython-310.pyc | Bin 1051 -> 1056 bytes .../__pycache__/dinoclip_vit.cpython-310.pyc | Bin 5687 -> 5692 bytes .../dinosiglip_vit.cpython-310.pyc | Bin 6191 -> 6196 bytes .../__pycache__/dinov2_vit.cpython-310.pyc | Bin 883 -> 888 bytes .../__pycache__/in1k_vit.cpython-310.pyc | Bin 962 -> 967 bytes .../__pycache__/siglip_vit.cpython-310.pyc | Bin 1109 -> 1114 bytes prismatic/models/load.py | 2 + .../models/pi3/models/dinov2/__init__.py | 6 + .../models/pi3/models/dinov2/hub/__init__.py | 4 + .../models/pi3/models/dinov2/hub/backbones.py | 156 ++++ 
.../models/pi3/models/dinov2/hub/utils.py | 39 + .../pi3/models/dinov2/layers/__init__.py | 11 + .../pi3/models/dinov2/layers/attention.py | 89 +++ .../models/pi3/models/dinov2/layers/block.py | 259 ++++++ .../pi3/models/dinov2/layers/dino_head.py | 58 ++ .../pi3/models/dinov2/layers/drop_path.py | 34 + .../pi3/models/dinov2/layers/layer_scale.py | 27 + .../models/pi3/models/dinov2/layers/mlp.py | 40 + .../pi3/models/dinov2/layers/patch_embed.py | 88 ++ .../pi3/models/dinov2/layers/swiglu_ffn.py | 72 ++ .../pi3/models/dinov2/models/__init__.py | 43 + .../dinov2/models/vision_transformer.py | 404 ++++++++++ .../pi3/models/dinov2/utils/__init__.py | 4 + .../models/pi3/models/dinov2/utils/cluster.py | 95 +++ .../models/pi3/models/dinov2/utils/config.py | 72 ++ .../models/pi3/models/dinov2/utils/dtype.py | 37 + .../pi3/models/dinov2/utils/param_groups.py | 103 +++ .../models/pi3/models/dinov2/utils/utils.py | 95 +++ .../models/pi3/models/layers/attention.py | 369 +++++++++ prismatic/models/pi3/models/layers/block.py | 406 ++++++++++ .../models/pi3/models/layers/camera_head.py | 93 +++ .../models/pi3/models/layers/pos_embed.py | 174 ++++ .../pi3/models/layers/transformer_head.py | 81 ++ prismatic/models/pi3/models/pi3.py | 216 +++++ prismatic/models/pi3/utils/basic.py | 223 ++++++ prismatic/models/pi3/utils/debug.py | 63 ++ prismatic/models/pi3/utils/geometry.py | 375 +++++++++ .../vlas/__pycache__/__init__.cpython-310.pyc | Bin 194 -> 199 bytes .../vlas/__pycache__/openvla.cpython-310.pyc | Bin 4936 -> 4941 bytes .../vlms/__pycache__/__init__.cpython-310.pyc | Bin 201 -> 206 bytes .../vlms/__pycache__/base_vlm.cpython-310.pyc | Bin 4693 -> 4698 bytes .../__pycache__/prismatic.cpython-310.pyc | Bin 15515 -> 15520 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 297 -> 302 bytes .../__pycache__/base_strategy.cpython-310.pyc | Bin 9070 -> 9075 bytes .../__pycache__/ddp.cpython-310.pyc | Bin 4833 -> 4838 bytes .../__pycache__/fsdp.cpython-310.pyc | Bin 7823 -> 7828 
bytes .../__pycache__/__init__.cpython-310.pyc | Bin 314 -> 287 bytes .../__pycache__/datasets.cpython-310.pyc | Bin 10812 -> 9218 bytes .../rlds/__pycache__/__init__.cpython-310.pyc | Bin 247 -> 252 bytes .../rlds/__pycache__/dataset.cpython-310.pyc | Bin 21906 -> 21911 bytes .../obs_transforms.cpython-310.pyc | Bin 3721 -> 3726 bytes .../traj_transforms.cpython-310.pyc | Bin 3403 -> 3408 bytes .../oxe/__pycache__/__init__.cpython-310.pyc | Bin 286 -> 291 bytes .../oxe/__pycache__/configs.cpython-310.pyc | Bin 10482 -> 10550 bytes .../__pycache__/materialize.cpython-310.pyc | Bin 4331 -> 4336 bytes .../oxe/__pycache__/mixtures.cpython-310.pyc | Bin 3325 -> 3330 bytes .../__pycache__/transforms.cpython-310.pyc | Bin 23438 -> 23443 bytes prismatic/vla/datasets/rlds/oxe/configs.py | 7 + .../__pycache__/droid_utils.cpython-310.pyc | Bin 4817 -> 4822 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 167 -> 172 bytes .../__pycache__/data_utils.cpython-310.pyc | Bin 12141 -> 12146 bytes .../goal_relabeling.cpython-310.pyc | Bin 1181 -> 1186 bytes .../task_augmentation.cpython-310.pyc | Bin 1697 -> 1702 bytes run.sh | 35 + vla-scripts/finetune.py | 37 +- vla_adapter.egg-info/PKG-INFO | 754 ++++++++++++++++-- vla_adapter.egg-info/SOURCES.txt | 30 +- vla_adapter.egg-info/requires.txt | 2 +- vla_adapter.egg-info/top_level.txt | 3 + 92 files changed, 4681 insertions(+), 98 deletions(-) rename .gitgnore => .gitignore (95%) create mode 160000 LIBERO create mode 160000 calvin create mode 100644 eval.sh create mode 100644 eval2.sh create mode 100644 prismatic/models/pi3/models/dinov2/__init__.py create mode 100644 prismatic/models/pi3/models/dinov2/hub/__init__.py create mode 100644 prismatic/models/pi3/models/dinov2/hub/backbones.py create mode 100644 prismatic/models/pi3/models/dinov2/hub/utils.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/__init__.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/attention.py create mode 100644 
prismatic/models/pi3/models/dinov2/layers/block.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/dino_head.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/drop_path.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/layer_scale.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/mlp.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/patch_embed.py create mode 100644 prismatic/models/pi3/models/dinov2/layers/swiglu_ffn.py create mode 100644 prismatic/models/pi3/models/dinov2/models/__init__.py create mode 100644 prismatic/models/pi3/models/dinov2/models/vision_transformer.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/__init__.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/cluster.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/config.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/dtype.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/param_groups.py create mode 100644 prismatic/models/pi3/models/dinov2/utils/utils.py create mode 100644 prismatic/models/pi3/models/layers/attention.py create mode 100644 prismatic/models/pi3/models/layers/block.py create mode 100644 prismatic/models/pi3/models/layers/camera_head.py create mode 100644 prismatic/models/pi3/models/layers/pos_embed.py create mode 100644 prismatic/models/pi3/models/layers/transformer_head.py create mode 100644 prismatic/models/pi3/models/pi3.py create mode 100644 prismatic/models/pi3/utils/basic.py create mode 100644 prismatic/models/pi3/utils/debug.py create mode 100644 prismatic/models/pi3/utils/geometry.py create mode 100644 run.sh diff --git a/.gitgnore b/.gitignore similarity index 95% rename from .gitgnore rename to .gitignore index 6160ebc..b71c432 100644 --- a/.gitgnore +++ b/.gitignore @@ -144,10 +144,15 @@ dmypy.json # Mac OS .DS_Store + # Caches and Datasets cache/ data/ - +pretrained_models/ # Rollout videos and wandb logs rollouts/ wandb/ +outputs/ 
+experiments/logs/ +evaluation_results/ +vla_adapter.egg-info/ diff --git a/LIBERO b/LIBERO new file mode 160000 index 0000000..8f1084e --- /dev/null +++ b/LIBERO @@ -0,0 +1 @@ +Subproject commit 8f1084e3132a39270c3a13ebe37270a43ece2a01 diff --git a/calvin b/calvin new file mode 160000 index 0000000..fa03f01 --- /dev/null +++ b/calvin @@ -0,0 +1 @@ +Subproject commit fa03f01f19c65920e18cf37398a9ce859274af76 diff --git a/eval.sh b/eval.sh new file mode 100644 index 0000000..0b4e84b --- /dev/null +++ b/eval.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=3 python experiments/robot/libero/run_libero_eval.py \ + --use_proprio True \ + --num_images_in_input 2 \ + --use_film False \ + --pretrained_checkpoint outputs/LIBERO-Long-Pro \ + --task_suite_name libero_10 \ + --use_pro_version True \ +# outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--1759126170--160000_chkpt \ +# > eval_logs/Spatial--chkpt.log 2>&1 & \ No newline at end of file diff --git a/eval2.sh b/eval2.sh new file mode 100644 index 0000000..8408a59 --- /dev/null +++ b/eval2.sh @@ -0,0 +1,6 @@ +export HF_HUB_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +CUDA_VISIBLE_DEVICES=7 python vla-scripts/evaluate_calvin.py \ + --pretrained_checkpoint outputs/CALVIN-ABC-Pro + # --pretrained_checkpoint outputs/configs+calvin_abc_rlds+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--calvin_abc_rlds----100000_chkpt \ \ No newline at end of file diff --git a/pretrained_models/configs/modeling_prismatic.py b/pretrained_models/configs/modeling_prismatic.py index 945d03e..968b95c 100644 --- a/pretrained_models/configs/modeling_prismatic.py +++ b/pretrained_models/configs/modeling_prismatic.py @@ -428,6 +428,14 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac Returns: Modified input_embeddings tensor """ + """ + * input_embeddings: [B, L_a + L_lang, Dim] + * all_actions_mask: [B, L_a + L_lang] + * 
noisy_action_features: [B, L_a, Dim] + * 此处其实是替换,我们 L_a + L_lang 这一串我们把 L_a 的部分,用 mask_indicies 索引从哪开始 L_a 这块 + * 我们 action_queries (论文核心设计)是 Embedding(num_tokens, dim) 的 weight + * 这一块是 [B, L_a + L_lang, Dim] 当中 L_a 替换成 action_queries 的 weight,L_lang 不动 + """ # Clone input to avoid modifying the original tensor new_input_embeddings = input_embeddings.clone() @@ -455,6 +463,15 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac def _process_action_masks(self, labels): """Helper to get action masks from labels""" + """ + * IGNORE_INDEX = -100, labels 中从第一个 -100 开始, + * ACTION_TOKEN_BEGIN_IDX = 151386 + * NUM_TOKENS = 64, action 有 64 个 token ,从而 labels 一般是 64 个非 -100 。 + * ACTION_DIM = 7,current_action 是 labels 里 前 6 个,next_actions 是 后 58 个 + * 两个 mask 都是 Boolean。因此 1-48 是 -100, 49 - 54 是 curr_action, 55 - 110 是 next_actions, 后面都是 -100。 + * 因而 all_action_mask 其实就是 [B, L] 这里 每一个 sample 中 64 个是 True,表示第几个 token 是 action 的。 + * action 部分的 64 个就是 True。余下的是 False + """ current_action_mask = get_current_action_mask(labels) next_actions_mask = get_next_actions_mask(labels) all_actions_mask = current_action_mask | next_actions_mask # (B, seq_len) @@ -462,6 +479,10 @@ def _process_action_masks(self, labels): def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False): """Process vision features with optional FiLM conditioning""" + """ + * 原设置没有 film condition,因此 language 的 feature embedding 不会传入给 vision transformer。 + * [B, 3 * num_images, H, W] --(vision)--> [B, 256 * num_images, D] --(projector)--> [B, 256 * num_images, llm_dim] + """ if use_film: # FiLM: Infuse language inputs into visual features patch_features = self.vision_backbone(pixel_values, language_embeddings) # (bsz, 256 * num_images, D) @@ -473,6 +494,11 @@ def _process_vision_features(self, pixel_values, language_embeddings=None, use_f def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector): """Process 
proprioceptive features and append to vision features""" + """ + * 将 proprio 投影到 [B, D] 的 vector,然后 [B, 1, D] + * 然后 append 到尾部 + * 实际上没有使用。 + """ if proprio_projector is not None and proprio is not None: # projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim) # proprio: (bsz, proprio_dim) or (propro_dim,) @@ -486,7 +512,13 @@ def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask): """Build multimodal embeddings and attention mask""" # Update attention mask - + """ + * 这里 input_embedding 中 L_a 的部分已经被替换为 nn.Embedding 的 weight了。 + * 其实就是 input_embed 和 mask 在 length 上和 vision 的 embed 里 cat + * multimodal_embeddings: [B, 1 + L_v + (L_a + L_lang -1), Dim] 注意这个 1 是 token。L_v 被插在了这二者之间了。 + * multimodal_attention_mask: [B, 1 + L_v + (L_a + L_lang -1)]。 + * vision 部分的 mask [B, L_v] 是 全 True 的。 + """ projected_patch_attention_mask = None if attention_mask is not None: projected_patch_attention_mask = torch.full( @@ -511,6 +543,7 @@ def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddin def _build_multimodal_labels(self, labels, projected_patch_embeddings): """Build multimodal labels with IGNORE_INDEX for patch embeddings""" + #* 所有 vision 部分的 index 都标为 -100(非 action 的 label),然后和原来 label [B, 1 + L_v + (L_a + L_lang -1)] 拼接 if labels is not None: projected_patch_labels = torch.full( (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]), @@ -543,6 +576,22 @@ def forward( use_film: bool = False, ) -> Union[Tuple, PrismaticCausalLMOutputWithPast]: """Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance.""" + """ + * Debug NOTE: + * input_ids has shape: [B, 120] with dtype: torch.int64 + ^ input_ids: + * attention_mask has shape: [B, 120] with dtype: torch.bool + ^ attention_mask [torch.where(~m)[0].tolist() for m in attention_mask] + ^ [[119], [109, 110, 111, 
112, 113, 114, 115, 116, 117, 118, 119], [114, 115, 116, 117, 118, 119], ...] + + * pixel_values has shape: [B, 12, 224, 224] with dtype: torch.float32 + * labels has shape: [B, 120] with dtype: torch.int64 + ^ [(r[0].item(), r[-1].item()) if len(r:=torch.where(l!=-100)[0]) else (None,None) for l in labels] + ^ -100 一段 --> 非 -100 --> -100 一段 + ^ [(54, 118), (44, 108), (49, 113), (48, 112), (50, 114), (44, 108), (49, 113), (55, 119)] + + * proprio has shape: [B, 8] with dtype: torch.float32 + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -595,6 +644,10 @@ def forward( # === Handle Multimodal Forward === elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]): + + #! Entered here! + #* input_ids: [B, L_a+L_lang](int64) --(embedding)--> [B, L_a+L_lang, Dim](bfloat16) where 120 is the sequence len. + #* non -100 labels are acion tokens. assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!" # Get input embeddings (from language model embeddings) @@ -604,6 +657,11 @@ def forward( # Extract action masks all_actions_mask = self._process_action_masks(labels) + #* labels 有 64 个 非 -100 的 id,mask 也就是对应 64 个 位置是 True。这里也就是 labels 非 -100 的位置对应 True,说明是 action token + #* input_embeddings: [B, L_a + L_lang, Dim] + #* all_actions_mask 定位 L_a 起始终止 index。 + #* language_embeddings: [B, L_lang, Dim] + #* projected_patch_embeddings: [B, L_vis, Dim] # Extract the language portion of the input embeddings (i.e. 
remove the action tokens portion) # print(input_embeddings[~all_actions_mask].size()) @@ -639,6 +697,10 @@ def forward( # Build labels for multimodal sequence if needed multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings) + + #* multimodal_embeddings: [B, 1 + L_vis + (L_a + L_lang -1), Dim] + #* multimodal_attention_mask: [B, 1 + L_vis + (L_a + L_lang -1)] + #* mask 在 L_vis 和 L_a 为 True,余下为 False,这其实是说 Langugae 部分是 Causal 而 action,vis 是 bidirectional。 # Dispatch to language model language_model_output = self.language_model( diff --git a/prismatic/extern/hf/modeling_prismatic.py b/prismatic/extern/hf/modeling_prismatic.py index 945d03e..17fb6a4 100644 --- a/prismatic/extern/hf/modeling_prismatic.py +++ b/prismatic/extern/hf/modeling_prismatic.py @@ -428,6 +428,14 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac Returns: Modified input_embeddings tensor """ + """ + * input_embeddings: [B, L_a + L_lang, Dim] + * all_actions_mask: [B, L_a + L_lang] + * noisy_action_features: [B, L_a, Dim] + * 此处其实是替换,我们 L_a + L_lang 这一串我们把 L_a 的部分,用 mask_indicies 索引从哪开始 L_a 这块 + * 我们 action_queries (论文核心设计)是 Embedding(num_tokens, dim) 的 weight + * 这一块是 [B, L_a + L_lang, Dim] 当中 L_a 替换成 action_queries 的 weight,L_lang 不动 + """ # Clone input to avoid modifying the original tensor new_input_embeddings = input_embeddings.clone() @@ -455,6 +463,15 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac def _process_action_masks(self, labels): """Helper to get action masks from labels""" + """ + * IGNORE_INDEX = -100, labels 中从第一个 -100 开始, + * ACTION_TOKEN_BEGIN_IDX = 151386 + * NUM_TOKENS = 64, action 有 64 个 token ,从而 labels 一般是 64 个非 -100 。 + * ACTION_DIM = 7,current_action 是 labels 里 前 6 个,next_actions 是 后 58 个 + * 两个 mask 都是 Boolean。因此 1-48 是 -100, 49 - 54 是 curr_action, 55 - 110 是 next_actions, 后面都是 -100。 + * 因而 all_action_mask 其实就是 [B, L] 这里 每一个 sample 中 64 个是 True,表示第几个 token 是 action 的。 + * 
action 部分的 64 个就是 True。余下的是 False + """ current_action_mask = get_current_action_mask(labels) next_actions_mask = get_next_actions_mask(labels) all_actions_mask = current_action_mask | next_actions_mask # (B, seq_len) @@ -462,6 +479,10 @@ def _process_action_masks(self, labels): def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False): """Process vision features with optional FiLM conditioning""" + """ + * 原设置没有 film condition,因此 language 的 feature embedding 不会传入给 vision transformer。 + * [B, 3 * num_images, H, W] --(vision)--> [B, 256 * num_images, D] --(projector)--> [B, 256 * num_images, llm_dim] + """ if use_film: # FiLM: Infuse language inputs into visual features patch_features = self.vision_backbone(pixel_values, language_embeddings) # (bsz, 256 * num_images, D) @@ -473,6 +494,11 @@ def _process_vision_features(self, pixel_values, language_embeddings=None, use_f def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector): """Process proprioceptive features and append to vision features""" + """ + * 将 proprio 投影到 [B, D] 的 vector,然后 [B, 1, D] + * 然后 append 到尾部 + * 实际上没有使用。 + """ if proprio_projector is not None and proprio is not None: # projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim) # proprio: (bsz, proprio_dim) or (propro_dim,) @@ -486,7 +512,13 @@ def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask): """Build multimodal embeddings and attention mask""" # Update attention mask - + """ + * 这里 input_embedding 中 L_a 的部分已经被替换为 nn.Embedding 的 weight了。 + * 其实就是 input_embed 和 mask 在 length 上和 vision 的 embed 里 cat + * multimodal_embeddings: [B, 1 + L_v + (L_a + L_lang -1), Dim] 注意这个 1 是 token。L_v 被插在了这二者之间了。 + * multimodal_attention_mask: [B, 1 + L_v + (L_a + L_lang -1)]。 + * vision 部分的 mask [B, L_v] 是 全 True 的。 + """ projected_patch_attention_mask = None 
if attention_mask is not None: projected_patch_attention_mask = torch.full( @@ -511,6 +543,7 @@ def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddin def _build_multimodal_labels(self, labels, projected_patch_embeddings): """Build multimodal labels with IGNORE_INDEX for patch embeddings""" + #* 所有 vision 部分的 index 都标为 -100(非 action 的 label),然后和原来 label [B, 1 + L_v + (L_a + L_lang -1)] 拼接 if labels is not None: projected_patch_labels = torch.full( (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]), @@ -543,6 +576,22 @@ def forward( use_film: bool = False, ) -> Union[Tuple, PrismaticCausalLMOutputWithPast]: """Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance.""" + """ + * Debug NOTE: + * input_ids has shape: [B, 120] with dtype: torch.int64 + ^ input_ids: + * attention_mask has shape: [B, 120] with dtype: torch.bool + ^ attention_mask [torch.where(~m)[0].tolist() for m in attention_mask] + ^ [[119], [109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], [114, 115, 116, 117, 118, 119], ...] + + * pixel_values has shape: [B, 12, 224, 224] with dtype: torch.float32 + * labels has shape: [B, 120] with dtype: torch.int64 + ^ [(r[0].item(), r[-1].item()) if len(r:=torch.where(l!=-100)[0]) else (None,None) for l in labels] + ^ -100 一段 --> 非 -100 --> -100 一段 + ^ [(54, 118), (44, 108), (49, 113), (48, 112), (50, 114), (44, 108), (49, 113), (55, 119)] + + * proprio has shape: [B, 8] with dtype: torch.float32 + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -595,6 +644,10 @@ def forward( # === Handle Multimodal Forward === elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]): + + #! Entered here! 
+ #* input_ids: [B, L_a+L_lang](int64) --(embedding)--> [B, L_a+L_lang, Dim](bfloat16) where 120 is the sequence len. + #* non -100 labels are acion tokens. assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!" # Get input embeddings (from language model embeddings) @@ -604,13 +657,18 @@ def forward( # Extract action masks all_actions_mask = self._process_action_masks(labels) + #* labels 有 64 个 非 -100 的 id,mask 也就是对应 64 个 位置是 True。这里也就是 labels 非 -100 的位置对应 True,说明是 action token + #* input_embeddings: [B, L_a + L_lang, Dim] + #* all_actions_mask 定位 L_a 起始终止 index。 + #* language_embeddings: [B, L_lang, Dim] + #* projected_patch_embeddings: [B, L_vis, Dim] # Extract the language portion of the input embeddings (i.e. remove the action tokens portion) # print(input_embeddings[~all_actions_mask].size()) language_embeddings = input_embeddings[~all_actions_mask].reshape( input_embeddings.shape[0], -1, input_embeddings.shape[2] ) # (B, lang_seq_len, llm_dim) - + # Get visual features projected_patch_embeddings = self._process_vision_features(pixel_values, language_embeddings, use_film) @@ -639,6 +697,10 @@ def forward( # Build labels for multimodal sequence if needed multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings) + + #* multimodal_embeddings: [B, 1 + L_vis + (L_a + L_lang -1), Dim] + #* multimodal_attention_mask: [B, 1 + L_vis + (L_a + L_lang -1)] + #* mask 在 L_vis 和 L_a 为 True,余下为 False,这其实是说 Langugae 部分是 Causal 而 action,vis 是 bidirectional。 # Dispatch to language model language_model_output = self.language_model( diff --git a/prismatic/models/action_heads.py b/prismatic/models/action_heads.py index 6719c96..5f24d66 100644 --- a/prismatic/models/action_heads.py +++ b/prismatic/models/action_heads.py @@ -47,6 +47,14 @@ def predict_action( proprio_projector=None, phase="Inference" ): + """ + * action_hidden_states: [B, Hidden, L_v + L_a, Dim] + * proprio_hidden_states: + * 
proprio_projector: [B, P_dim] --> [B, 1, Dim] + * 输出时:task_hidden_states: [B, Hidden, L_v, Dim], action_hidden_states: [B, Hidden, L_a, Dim] + * cond_actions_hidden_states: [B, A_dim * A_chunk, Dim] --(reshape)-- [B, A_chunk, A_dim * Dim] + * 这 rearranged_actions_hidden_states 是 Learnable PE + """ batch_size = actions_hidden_states.shape[0] device = actions_hidden_states.device @@ -110,7 +118,8 @@ def __init__( def forward(self, x, h_a=None, h_t=None, p= None): - + #* [B, A_chunk, A_dim * Dim] -> [B, A_chunk, Dim] -> [B, A_chunk, A_dim] + #* 每一个 block 内部的过程是: # x: (batch_size, input_dim) x = self.layer_norm1(x) # shape: (batch_size, input_dim) x = self.fc1(x) # shape: (batch_size, hidden_dim) @@ -340,6 +349,14 @@ def forward(self, x, h_a=None, h_t=None, p=None): h_a: adapter tokens h_t: task tokens p: possible conditioning vector (for FiLM) + * x: [B, A_chunk, Dim] + * h_a: [B, L_a, Dim] + * h_t: [B, L_v, Dim] + * p: [B, 1, Dim] + * 三种:[B, n, A_chunk, dim], [B, n, L_a + p, dim], [B, n, L_v, dim] MHA 方式,加入 RoPE + * [B, n, A_chunk, dim] 的 q 和 自身的 k、h_t 的 k、h_a 的 k 分别做点积,得到三个 + * [B, n, A_chunk, A_chunk], [B, n, A_chunk, L_a + p], [B, n, A_chunk, L_v] , cat 就是 [B, n, A_chunk, A_chunk + (L_a + p) + L_v] + * 而 v 三者 cat 在一起就是 [B, n, A_chunk + (L_a + p) + L_v, dim] --> [B, n, A_chunk, dim] """ g = self.gating_factor ratio_g = torch.tanh(g) diff --git a/prismatic/models/backbones/llm/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/__init__.cpython-310.pyc index 1ee1bfcce482b99b730cb1d41c75724686a39ca4..328b2ce43dd80d11c1ab32db9b20b917acee6762 100644 GIT binary patch delta 58 zcmcb^^pJ@=pO=@50SLU-U6{!IPRU0k}C;$Ke delta 53 zcmaFJbccyMpO=@50SLq|pPR`2PS#z&C_leMKe4nVKP5-EB)336zaTZQEGJPnKdof4 HG@}~;$9)mL diff --git a/prismatic/models/backbones/llm/__pycache__/base_llm.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/base_llm.cpython-310.pyc index 704d3391419f23ecb69e57c4fb5a74b417fd7e1c..fd0ca7f6517d26e04048ef0d985358c0b4d94cbb 
100644 GIT binary patch delta 200 zcmbPdz1W&NpO=@50SLU-UD(Ji$*$z1pOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L zSWuE$wAqpU2TuY=K}M#ok*;x4l|W&6YMzm)u0gzMl5Rn1QEHVSvPf!0Nl_wPks(|W zA5@V!R1=JE4B>B{DWb;6cx&?k(F;tBwhJ}A#PNw=W1D76a4H;3|Jstw{zp&}dw8JW69y2eSH x=ZUB>GDdAaDtdv5DN1&-w}kBEEfQ0JBA$|3%#2Z!OJtmZjaZ!RhpO=@50SLU-UD(LIn^nn6KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg yv7jWiX!B#%b%;Hfzskuq1De)!w*{ONOx7czLD^rV#qb9HA{Lg5$c`erxMgVPqAlm={ diff --git a/prismatic/models/backbones/llm/__pycache__/mistral.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/mistral.cpython-310.pyc index d7ad9c74ca576dfd6c16c01c8a64e9442bde0c46..87082ed25d728505464669cafaf6044c0333958b 100644 GIT binary patch delta 66 zcmdlcyjz$%pO=@50SLU-UD(Kdl10f|KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Vv7jWiX!A!FK^DealbJa;0|2!x74`rC delta 61 zcmdljyiJ%ppO=@50SJUIpWDcNl10`{zbHSyL_e{#BtIobwDM0Hu-@&Hw-a delta 61 zcmca9ctwyqpO=@50SJUIpWDc-!y@aXUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) PIg;fvGh@_bU5+&XK5rA{ diff --git a/prismatic/models/backbones/llm/__pycache__/qwen25.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/qwen25.cpython-310.pyc index e60b6983244d998fa4414c1e65eb2ef61ff58c6c..56835870c0f793b0950731dced1f172392a7cacb 100644 GIT binary patch delta 77 zcmcaF{zRNRpO=@50SLU-UD(L|gGI?pKO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg gv7jWiXtM(AK32wClkaosGTkzp{D4b-vM%=~0Ox8N-T(jq delta 81 zcmaDNeqWqBpO=@50SLq|pWDd&gGJU=zbHSyL_e{#BtIobwb%7 diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/__init__.cpython-310.pyc index 9db5d2900cdd31d84f65a2ae96476ad64bae2cd6..1290d8ce8cf7a06076947df35acd953194be844e 100644 GIT binary patch delta 59 zcmX@ea*TyLpO=@50SLU-UD(Kdlu;>6KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Nv7jWiX!2Xe5C9?36Wss+ delta 54 zcmX@ca*%~PpO=@50SH7dpWDcNlur8w2RRA=QCt*p diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/phi_prompter.cpython-310.pyc 
b/prismatic/models/backbones/llm/prompting/__pycache__/phi_prompter.cpython-310.pyc index 180d54604105c1dab836f265f458472f464595aa..53bf3e7243c44093fa3e1ac0c3b21e93623611cd 100644 GIT binary patch delta 66 zcmew*_+OAapO=@50SLU-UD(L|ok=N5KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Uv7jWiXtO+X7Axbe$z2=;0Ns2Re*gdg delta 61 zcmew__)CyGpO=@50SH7cpWDd&ok=!SzbHSyL_e{#BtIobwSjQ&&bbB)h{Z|%t*~kFHg)%*DuRS)DQD<)OAcr UEGS7W+WeYnE*s;m$(@{)0HO~TY5)KL delta 61 zcmZn{X%gYi=jG*M00NQA=QeU5W|9rlFUrp^(N8Qb$xq4AEy*p=&o4;LE6YjL%}*=Y P{DEmM8)MYuKF&%2I^`6C diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/vicuna_v15_prompter.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/vicuna_v15_prompter.cpython-310.pyc index 52a31928c5e192e83b152a0f3eafae4498cb231b..66edffe7a54de8075705125c40f9f3b031c4a75b 100644 GIT binary patch delta 66 zcmdlfvQLCNpO=@50SLU-UD(K7!K{?1pOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L USWuE$w0QxuEF0sk$@@5~0jlN}0RR91 delta 61 zcmdldvQvaRpO=@50SH7dpWDb?!7Ll2UzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) Pc`36j8)MYu!<^LsLmCts diff --git a/prismatic/models/backbones/vision/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/__init__.cpython-310.pyc index 28c8c5660fb1a57583a8b3216ce4ee94fbf19e56..0e29c88d679c813560eaec3548446bca571468e8 100644 GIT binary patch delta 59 zcmZ3-vYCZDpO=@50SLU-UD(LIol(hOKO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Nv7jWiX!3nVe*hJ}6ORA@ delta 54 zcmdnYvW|s2pO=@50SKfnpWDd2ol(|HzbHSyL_e{#BtIobw+(K(bs^5lGGyod*Em?;0ln delta 73 zcmdm^vt5TfpO=@50SF{7pWDd&nOW9PzbHSyL_e{#BtIobwpO=@50SF{7pWDbC!y+4?UzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) Yxs#<^h%suisCX!l%obM!lD*=~0Z3vOegFUf diff --git a/prismatic/models/backbones/vision/__pycache__/dinov2_vit.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/dinov2_vit.cpython-310.pyc index fbec061c47047cd085a31efed9cb227aea184fa7..c47c3d03cf0d23ae4ead7f6dd97f0336a96f5914 100644 GIT binary patch delta 65 zcmey&_JfT(pO=@50SLU-UD(L|jZrC3KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg 
Tv7jWiXtEsB3&vZM*_m?yzXlc? delta 60 zcmeyt_L+@4pO=@50SF{7pWDd&jZxM|zbHSyL_e{#BtIobwP#;D0W%sBv6jubcm diff --git a/prismatic/models/backbones/vision/__pycache__/in1k_vit.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/in1k_vit.cpython-310.pyc index 392ecaa636135f3e3fe5703f76c3b068cc34100e..e3c5e78cc9310eb48d72e42ab8ab9e27e6c6e060 100644 GIT binary patch delta 65 zcmX@aew>{zbHSyL_e{#BtIobw> Wraps `logging.Logger` overwatch = initialize_overwatch(__name__) diff --git a/prismatic/models/pi3/models/dinov2/__init__.py b/prismatic/models/pi3/models/dinov2/__init__.py new file mode 100644 index 0000000..ae847e4 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +__version__ = "0.0.1" diff --git a/prismatic/models/pi3/models/dinov2/hub/__init__.py b/prismatic/models/pi3/models/dinov2/hub/__init__.py new file mode 100644 index 0000000..b88da6b --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/hub/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. diff --git a/prismatic/models/pi3/models/dinov2/hub/backbones.py b/prismatic/models/pi3/models/dinov2/hub/backbones.py new file mode 100644 index 0000000..53fe837 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/hub/backbones.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +from enum import Enum +from typing import Union + +import torch + +from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name + + +class Weights(Enum): + LVD142M = "LVD142M" + + +def _make_dinov2_model( + *, + arch_name: str = "vit_large", + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = "mlp", + block_chunks: int = 0, + num_register_tokens: int = 0, + interpolate_antialias: bool = False, + interpolate_offset: float = 0.1, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.LVD142M, + **kwargs, +): + from ..models import vision_transformer as vits + + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + model_base_name = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + num_register_tokens=num_register_tokens, + interpolate_antialias=interpolate_antialias, + interpolate_offset=interpolate_offset, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + if pretrained: + model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) + url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" + state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + + return model + + +def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + **kwargs, + ) + + +def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_small", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name="vit_base", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_large", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) diff --git a/prismatic/models/pi3/models/dinov2/hub/utils.py b/prismatic/models/pi3/models/dinov2/hub/utils.py new file mode 100644 index 0000000..9c66414 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/hub/utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import itertools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class CenterPadding(nn.Module): + def __init__(self, multiple): + super().__init__() + self.multiple = multiple + + def _get_pad(self, size): + new_size = math.ceil(size / self.multiple) * self.multiple + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + @torch.inference_mode() + def forward(self, x): + pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) + output = F.pad(x, pads) + return output diff --git a/prismatic/models/pi3/models/dinov2/layers/__init__.py b/prismatic/models/pi3/models/dinov2/layers/__init__.py new file mode 100644 index 0000000..05a0b61 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/prismatic/models/pi3/models/dinov2/layers/attention.py b/prismatic/models/pi3/models/dinov2/layers/attention.py new file mode 100644 index 0000000..3fed573 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/attention.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import os +import warnings + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import memory_efficient_attention, unbind + + XFORMERS_AVAILABLE = True + # warnings.warn("xFormers is available (Attention)") + else: + # warnings.warn("xFormers is disabled (Attention)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + # warnings.warn("xFormers is not available (Attention)") + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + if 
attn_bias is not None: + raise AssertionError("xFormers is required for using nested tensors") + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/prismatic/models/pi3/models/dinov2/layers/block.py b/prismatic/models/pi3/models/dinov2/layers/block.py new file mode 100644 index 0000000..fd5b8a7 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/block.py @@ -0,0 +1,259 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +import os +from typing import Callable, List, Any, Tuple, Dict +import warnings + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import fmha, scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True + # warnings.warn("xFormers is available (Block)") + else: + # warnings.warn("xFormers is disabled (Block)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + # warnings.warn("xFormers is not available (Block)") + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, 
+ attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + 
return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + 
seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = 
drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + if not XFORMERS_AVAILABLE: + raise AssertionError("xFormers is required for using nested tensors") + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/prismatic/models/pi3/models/dinov2/layers/dino_head.py b/prismatic/models/pi3/models/dinov2/layers/dino_head.py new file mode 100644 index 0000000..0ace8ff --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/dino_head.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/prismatic/models/pi3/models/dinov2/layers/drop_path.py b/prismatic/models/pi3/models/dinov2/layers/drop_path.py new file mode 100644 index 0000000..1d640e0 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/drop_path.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/prismatic/models/pi3/models/dinov2/layers/layer_scale.py b/prismatic/models/pi3/models/dinov2/layers/layer_scale.py new file mode 100644 index 0000000..51df0d7 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/layer_scale.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/prismatic/models/pi3/models/dinov2/layers/mlp.py b/prismatic/models/pi3/models/dinov2/layers/mlp.py new file mode 100644 index 0000000..bbf9432 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/mlp.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/prismatic/models/pi3/models/dinov2/layers/patch_embed.py b/prismatic/models/pi3/models/dinov2/layers/patch_embed.py new file mode 100644 index 0000000..8b7c080 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/layers/patch_embed.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. 
+ patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. + """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/prismatic/models/pi3/models/dinov2/layers/swiglu_ffn.py b/prismatic/models/pi3/models/dinov2/layers/swiglu_ffn.py new file mode 100644 index 0000000..5ce2115 --- /dev/null +++ 
b/prismatic/models/pi3/models/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import os +from typing import Callable, Optional +import warnings + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True + # warnings.warn("xFormers is available (SwiGLU)") + else: + # warnings.warn("xFormers is disabled (SwiGLU)") + raise ImportError +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + # warnings.warn("xFormers is not available (SwiGLU)") + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + 
out_features=out_features, + bias=bias, + ) diff --git a/prismatic/models/pi3/models/dinov2/models/__init__.py b/prismatic/models/pi3/models/dinov2/models/__init__.py new file mode 100644 index 0000000..3fdff20 --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/models/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + + +logger = logging.getLogger("dinov2") + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix("_memeff") + if "vit" in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + num_register_tokens=args.num_register_tokens, + interpolate_offset=args.interpolate_offset, + interpolate_antialias=args.interpolate_antialias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) diff --git a/prismatic/models/pi3/models/dinov2/models/vision_transformer.py b/prismatic/models/pi3/models/dinov2/models/vision_transformer.py new file mode 100644 index 0000000..73f15cf --- /dev/null +++ b/prismatic/models/pi3/models/dinov2/models/vision_transformer.py @@ -0,0 +1,404 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint +from torch.nn.init import trunc_normal_ + +from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block +from ...layers.attention import FlashAttention + + +# logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + 
in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, 
def interpolate_pos_encoding(self, x, w, h):
    """Return positional embeddings resized to match the token grid of ``x``.

    Args:
        x: token tensor; ``x.shape[1] - 1`` is taken as the number of patch
            tokens and ``x.shape[-1]`` as the embedding width.
        w, h: the last two entries of the input image shape, in the order the
            caller unpacked them (``B, nc, w, h = x.shape``).

    Returns:
        ``self.pos_embed`` unchanged when the patch count matches and
        ``w == h``; otherwise a ``(1, 1 + w0 * h0, dim)`` tensor cast to
        ``x.dtype``, with ``w0 = w // patch_size`` and ``h0 = h // patch_size``.
    """
    input_dtype = x.dtype
    num_input_patches = x.shape[1] - 1
    num_stored_patches = self.pos_embed.shape[1] - 1
    if num_input_patches == num_stored_patches and w == h:
        # Same grid as the stored table: nothing to resize (and no dtype cast).
        return self.pos_embed

    # Interpolate in float32, whatever precision the tokens use.
    embed = self.pos_embed.float()
    cls_embed, grid_embed = embed[:, 0], embed[:, 1:]
    channels = x.shape[-1]
    rows, cols = w // self.patch_size, h // self.patch_size

    # The stored table must describe a square grid of patches.
    side = int(math.sqrt(num_stored_patches))
    assert num_stored_patches == side * side

    if self.interpolate_offset:
        # Historical kludge: add a small number to avoid floating point error in the
        # interpolation, see https://github.com/facebookresearch/dino/issues/8
        # Still needed for backward-compatibility: the underlying operators use both
        # the output size and the scale factors.
        resize_args = {
            "scale_factor": (
                float(rows + self.interpolate_offset) / side,
                float(cols + self.interpolate_offset) / side,
            )
        }
    else:
        # Simply specify an output size instead of a scale factor.
        resize_args = {"size": (rows, cols)}

    resized = nn.functional.interpolate(
        grid_embed.reshape(1, side, side, channels).permute(0, 3, 1, 2),
        mode="bicubic",
        antialias=self.interpolate_antialias,
        **resize_args,
    )
    assert (rows, cols) == resized.shape[-2:]

    # Back to (1, rows * cols, channels) and re-attach the class-token embedding.
    resized = resized.permute(0, 2, 3, 1).view(1, -1, channels)
    return torch.cat((cls_embed.unsqueeze(0), resized), dim=1).to(input_dtype)
def forward_features(self, x, masks=None):
    """Run the backbone and return normalised token groups.

    A list input is delegated to ``forward_features_list``. Otherwise the
    input is tokenised, passed through every entry of ``self.blocks``
    (wrapped in ``torch.utils.checkpoint.checkpoint`` while
    ``self.training`` is true) and normalised with ``self.norm``.

    Returns:
        dict with the class token, the register tokens, the patch tokens
        (all taken from the normalised output), the pre-norm tokens and the
        ``masks`` argument passed through unchanged.
    """
    if isinstance(x, list):
        return self.forward_features_list(x, masks)

    tokens = self.prepare_tokens_with_masks(x, masks)
    for block in self.blocks:
        tokens = checkpoint(block, tokens, use_reentrant=False) if self.training else block(tokens)

    normed = self.norm(tokens)
    # Token layout: [cls | register tokens | patch tokens]
    first_patch = self.num_register_tokens + 1
    return {
        "x_norm_clstoken": normed[:, 0],
        "x_norm_regtokens": normed[:, 1:first_patch],
        "x_norm_patchtokens": normed[:, first_patch:],
        "x_prenorm": tokens,
        "masks": masks,
    }
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility).

    Only ``nn.Linear`` modules are touched: the weight is re-drawn from a
    truncated normal with ``std=0.02`` and the bias, when present, is zeroed.
    ``name`` is accepted so the function can be used with ``named_apply``;
    it is not read.
    """
    if not isinstance(module, nn.Linear):
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is None:
        return
    nn.init.zeros_(module.bias)
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a ``DinoVisionTransformer`` with embed_dim 768, depth 12 and 12 heads.

    Extra keyword arguments are forwarded to the ``DinoVisionTransformer``
    constructor unchanged.
    """
    # NOTE(review): DinoVisionTransformer.__init__ in this file calls
    # block_fn(..., attn_class=FlashAttention); an explicit keyword passed at
    # call time takes precedence over the one bound in this partial, so the
    # MemEffAttention choice here looks ineffective - confirm this is intended.
    block_factory = partial(Block, attn_class=MemEffAttention)
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=block_factory,
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
from enum import Enum
import os
from pathlib import Path
from typing import Any, Dict, Optional


class ClusterType(Enum):
    """Clusters for which checkpoint paths and SLURM settings are defined below."""

    AWS = "aws"
    FAIR = "fair"
    RSC = "rsc"


def _guess_cluster_type() -> ClusterType:
    """Infer the cluster from the kernel release / host name; default to FAIR."""
    # os.uname() does not exist on every platform (e.g. Windows). Calling it
    # unconditionally raised AttributeError there; fall back to the same
    # default used for every unrecognised machine instead.
    uname_fn = getattr(os, "uname", None)
    if uname_fn is None:
        return ClusterType.FAIR

    uname = uname_fn()
    if uname.sysname == "Linux":
        if uname.release.endswith("-aws"):
            # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
            return ClusterType.AWS
        if uname.nodename.startswith("rsc"):
            # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
            return ClusterType.RSC

    return ClusterType.FAIR


def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
    """Return ``cluster_type`` if given, otherwise the guessed cluster type."""
    if cluster_type is None:
        return _guess_cluster_type()

    return cluster_type


def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the root checkpoint directory for the (given or guessed) cluster."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    CHECKPOINT_DIRNAMES = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }
    return Path("/") / CHECKPOINT_DIRNAMES[cluster_type]


def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return ``<checkpoint root>/<$USER>``; requires the USER environment variable."""
    checkpoint_path = get_checkpoint_path(cluster_type)
    if checkpoint_path is None:
        return None

    username = os.environ.get("USER")
    assert username is not None
    return checkpoint_path / username


def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Return the SLURM partition name for the (given or guessed) cluster."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    SLURM_PARTITIONS = {
        ClusterType.AWS: "learnlab",
        ClusterType.FAIR: "learnlab",
        ClusterType.RSC: "learn",
    }
    return SLURM_PARTITIONS[cluster_type]


def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build the SLURM executor parameter dict for a job.

    Args:
        nodes: number of nodes to request.
        num_gpus_per_node: GPUs per node; also used as the task count per node.
        cluster_type: target cluster; guessed from the host when ``None``.
        **kwargs: extra parameters, applied last so they override the defaults.

    Returns:
        Parameter dict. On AWS the ``mem_gb`` entry is removed and 12 CPUs per
        task are requested; on RSC 12 CPUs per task are requested.
    """
    # Resolve once so the partition and the adjustments below are derived
    # from the same cluster type (previously the host was inspected twice).
    cluster_type = get_cluster_type(cluster_type)

    # create default parameters
    params = {
        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,  # one task per GPU
        "cpus_per_task": 10,
        "nodes": nodes,
        "slurm_partition": get_slurm_partition(cluster_type),
    }
    # apply cluster-specific adjustments
    if cluster_type == ClusterType.AWS:
        params["cpus_per_task"] = 12
        del params["mem_gb"]
    elif cluster_type == ClusterType.RSC:
        params["cpus_per_task"] = 12
    # set additional parameters / apply overrides
    params.update(kwargs)
    return params
def apply_scaling_rules_to_cfg(cfg):  # to fix
    """Set ``cfg.optim.lr`` from ``cfg.optim.base_lr`` and return ``cfg``.

    Only the ``"sqrt_wrt_1024"`` rule is implemented: the base learning rate
    is multiplied by ``sqrt(batch_size_per_gpu * global_size / 1024)``.

    Raises:
        NotImplementedError: for any other ``cfg.optim.scaling_rule``.
    """
    if cfg.optim.scaling_rule != "sqrt_wrt_1024":
        raise NotImplementedError

    base_lr = cfg.optim.base_lr
    global_batch_size = cfg.train.batch_size_per_gpu * distributed.get_global_size()
    # Assign first, then scale in place, exactly as the config object was
    # updated before.
    cfg.optim.lr = base_lr
    cfg.optim.lr *= math.sqrt(global_batch_size / 1024.0)
    logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
    return cfg
from typing import Dict, Union

import numpy as np
import torch


# Anything as_torch_dtype() accepts: a dtype name, a NumPy dtype or a torch dtype.
TypeSpec = Union[str, np.dtype, torch.dtype]


# NumPy dtypes that have an entry here can be converted; any other dtype
# makes as_torch_dtype() raise KeyError.
_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
    np.dtype("bool"): torch.bool,
    np.dtype("uint8"): torch.uint8,
    np.dtype("int8"): torch.int8,
    np.dtype("int16"): torch.int16,
    np.dtype("int32"): torch.int32,
    np.dtype("int64"): torch.int64,
    np.dtype("float16"): torch.float16,
    np.dtype("float32"): torch.float32,
    np.dtype("float64"): torch.float64,
    np.dtype("complex64"): torch.complex64,
    np.dtype("complex128"): torch.complex128,
}


def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
    """Convert a dtype specification to a ``torch.dtype``.

    Args:
        dtype: a ``torch.dtype`` (returned as is), a dtype name understood by
            ``np.dtype`` (e.g. ``"float32"``), or a ``np.dtype``.

    Returns:
        The matching ``torch.dtype``.

    Raises:
        AssertionError: if ``dtype`` is none of the accepted kinds.
        KeyError: if the NumPy dtype has no entry in the conversion table.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    if not isinstance(dtype, np.dtype):
        # Raised explicitly (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError(f"Expected an instance of numpy dtype, got {type(dtype)}")
    return _NUMPY_TO_TORCH_DTYPE[dtype]
from collections import defaultdict
import logging


logger = logging.getLogger("dinov2")


def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.
        force_is_backbone (bool): treat the parameter as a backbone parameter
            even when its name does not start with "backbone".
        chunked_blocks (bool): for names without a leading ".blocks.", read the
            index two components after "blocks" instead of one.
    Returns:
        lr decay rate for the given parameter.
    """
    embedding_keys = ("pos_embed", "patch_embed", "mask_token", "cls_token", "register_tokens")

    # Parameters outside the backbone keep layer_id == num_layers + 1, i.e. no decay.
    layer_id = num_layers + 1
    if name.startswith("backbone") or force_is_backbone:
        dotted_embedding = any(f".{key}" in name for key in embedding_keys)
        bare_embedding = force_is_backbone and any(key in name for key in embedding_keys)
        if dotted_embedding or bare_embedding:
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            tail = name[name.find(".blocks.") :]
            layer_id = int(tail.split(".")[2]) + 1
        elif "blocks." in name and "residual." not in name:
            tail = name[name.find("blocks.") :]
            index_position = 2 if chunked_blocks else 1
            layer_id = int(tail.split(".")[index_position]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)


def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
    """Build one optimizer group description per trainable parameter of ``model``.

    Each entry is a dict with the parameter, its (FSDP-prefix-stripped) name,
    an lr multiplier from ``get_vit_lr_decay_rate`` (further scaled by
    ``patch_embed_lr_mult`` for patch-embedding parameters), a weight-decay
    multiplier (0.0 for biases and names containing "norm" or "gamma") and an
    ``is_last_layer`` flag.
    """
    chunked_blocks = False
    if hasattr(model, "n_blocks"):
        logger.info("chunked fsdp")
        n_blocks = model.n_blocks
        chunked_blocks = model.chunked_blocks
    elif hasattr(model, "blocks"):
        logger.info("first code branch")
        n_blocks = len(model.blocks)
    elif hasattr(model, "backbone"):
        logger.info("second code branch")
        n_blocks = len(model.backbone.blocks)
    else:
        logger.info("else code branch")
        n_blocks = 0

    groups = []
    for raw_name, param in model.named_parameters():
        name = raw_name.replace("_fsdp_wrapped_module.", "")
        if not param.requires_grad:
            continue

        lr_mult = get_vit_lr_decay_rate(
            name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
        )
        if "patch_embed" in name:
            lr_mult = lr_mult * patch_embed_lr_mult

        skip_weight_decay = name.endswith(".bias") or "norm" in name or "gamma" in name
        wd_mult = 0.0 if skip_weight_decay else 1.0

        groups.append(
            {
                "params": param,
                "is_last_layer": "last_layer" in name,
                "lr_multiplier": lr_mult,
                "wd_multiplier": wd_mult,
                "name": name,
            }
        )
        logger.info(f"{name}: lr_multiplier: {lr_mult}, wd_multiplier: {wd_mult}")

    return groups


def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
    """Merge group dicts that share the same values for every entry of ``keys``.

    Returns the values view of the fused mapping; each fused group holds the
    shared key values plus a ``"params"`` list with the merged parameters.
    """
    fused = defaultdict(lambda: {"params": []})
    for group in all_params_groups:
        identifier = "".join(f"{key}{group[key]}_" for key in keys)
        bucket = fused[identifier]
        bucket.update((key, group[key]) for key in keys)
        bucket["params"].append(group["params"])

    return fused.values()
def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
    """Load a checkpoint into ``model`` non-strictly and report what matched.

    Args:
        model: module whose ``load_state_dict`` is called.
        pretrained_weights: URL or local path of the checkpoint. Anything for
            which ``urlparse`` reports a scheme is fetched through
            ``torch.hub.load_state_dict_from_url``; everything else goes to
            ``torch.load``.
        checkpoint_key: optional top-level key (checked with ``in``) under
            which the state dict is stored; ignored when ``None`` or absent.

    Returns:
        The result of ``model.load_state_dict(..., strict=False)``. Previously
        this was computed and discarded, so a checkpoint whose keys did not
        match at all loaded without any signal; callers can now inspect it.
    """
    # NOTE(review): a Windows path such as "C:\\ckpt.pth" also has a non-empty
    # urlparse scheme and would be routed to the URL branch - confirm whether
    # that matters for this project.
    if urlparse(pretrained_weights).scheme:  # If it looks like an URL
        state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
    else:
        state_dict = torch.load(pretrained_weights, map_location="cpu")

    if checkpoint_key is not None and checkpoint_key in state_dict:
        state_dict = state_dict[checkpoint_key]

    # Drop every "module." occurrence, then every "backbone." occurrence (the
    # latter is induced by the multicrop wrapper). str.replace removes all
    # occurrences, not only a leading prefix; kept as is for compatibility.
    state_dict = {
        key.replace("module.", "").replace("backbone.", ""): value for key, value in state_dict.items()
    }
    return model.load_state_dict(state_dict, strict=False)
class CosineScheduler(object):
    """Precomputed schedule: zeros, then a linear warm-up, then a cosine curve.

    The first ``freeze_iters`` entries are 0, the next ``warmup_iters`` entries
    go linearly from ``start_warmup_value`` to ``base_value``, and the
    remaining entries follow a half cosine that starts at ``base_value`` and
    heads towards ``final_value``. Indexing at or beyond ``total_iters``
    returns ``final_value``.
    """

    def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
        super().__init__()
        self.final_value = final_value
        self.total_iters = total_iters

        frozen_part = np.zeros((freeze_iters))
        warmup_part = np.linspace(start_warmup_value, base_value, warmup_iters)

        steps = np.arange(total_iters - warmup_iters - freeze_iters)
        cosine_part = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * steps / len(steps)))

        self.schedule = np.concatenate((frozen_part, warmup_part, cosine_part))
        assert len(self.schedule) == self.total_iters

    def __getitem__(self, it):
        if it < self.total_iters:
            return self.schedule[it]
        return self.final_value
class Attention(nn.Module):
    """Multi-head self-attention with a fused qkv projection.

    ``forward`` accepts ``attn_bias`` for signature compatibility with the
    subclasses in this file, but this implementation does not read it.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        # Scale queries by 1 / sqrt(per-head width).
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # (B, N, 3C) -> (3, B, heads, N, head_dim), then split into q / k / v.
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        weights = (q * self.scale) @ k.transpose(-2, -1)
        weights = self.attn_drop(weights.softmax(dim=-1))

        # Merge the heads back into the channel dimension.
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(merged))
class FlashAttention(Attention):
    """``Attention`` variant that runs ``scaled_dot_product_attention``.

    bfloat16 inputs are restricted to the FLASH_ATTENTION backend; every other
    dtype is restricted to the MATH and EFFICIENT_ATTENTION backends.
    ``attn_bias`` is accepted but not read, and ``self.attn_drop`` is not
    applied in this path.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # (B, N, 3, heads, head_dim) -> (B, heads, 3, N, head_dim)
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim).transpose(1, 3)
        q, k, v = packed.unbind(dim=2)

        if q.dtype == torch.bfloat16:
            backends = SDPBackend.FLASH_ATTENTION
        else:
            backends = [SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]
        with nn.attention.sdpa_kernel(backends):
            attended = scaled_dot_product_attention(q, k, v)

        merged = attended.transpose(1, 2).reshape([batch, tokens, channels])
        return self.proj_drop(self.proj(merged))
class MemEffCrossAttentionRope(CrossAttentionRope):
    """``CrossAttentionRope`` that uses xFormers' memory-efficient kernel when available."""

    def forward(self, query: Tensor, key: Tensor, value: Tensor, attn_bias=None, qpos=None, kpos=None) -> Tensor:
        """
        Args:
            query: Tensor of shape (B, N, C), input query
            key: Tensor of shape (B, M, C), input key
            value: Tensor of shape (B, M, C), input value
            attn_bias: Optional attention bias, forwarded to the xFormers kernel
            qpos: positions handed to ``self.rope`` for the queries
            kpos: positions handed to ``self.rope`` for the keys
        Returns:
            Tensor of shape (B, N, C), output of cross-attention
        Raises:
            AssertionError: if ``attn_bias`` is given but xFormers is unavailable.
        """
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # Forward the positions as well: the previous call dropped qpos and
            # kpos, so the fallback handed None positions to the rotary embedding.
            return super().forward(query, key, value, attn_bias, qpos=qpos, kpos=kpos)

        B, N, C = query.shape
        _, M, _ = key.shape
        head_dim = C // self.num_heads

        # Project query, key, and value: (B, tokens, heads, head_dim)
        q = self.q_proj(query).reshape(B, N, self.num_heads, head_dim)
        k = self.k_proj(key).reshape(B, M, self.num_heads, head_dim)
        v = self.v_proj(value).reshape(B, M, self.num_heads, head_dim)

        # Norm and rotary embedding are applied in (B, heads, tokens, head_dim)
        # layout, the same layout the parent class uses.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        # Back to (B, tokens, heads, head_dim) for the xFormers call.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)

        # Compute memory-efficient attention. Neither self.scale nor
        # self.attn_drop is applied explicitly on this path.
        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape(B, N, C)

        # Final projection
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class MemEffAttentionRope(AttentionRope):
    """``AttentionRope`` that uses xFormers' memory-efficient kernel when available."""

    def forward(self, x: Tensor, attn_bias=None, xpos=None) -> Tensor:
        """Self-attention over ``x`` of shape (B, N, C).

        Args:
            x: input tokens.
            attn_bias: optional attention bias, forwarded to the xFormers kernel.
            xpos: positions handed to ``self.rope`` for both queries and keys.

        Raises:
            AssertionError: if ``attn_bias`` is given but xFormers is unavailable.
        """
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # Forward xpos as well: the previous call dropped it, so the
            # fallback handed None positions to the rotary embedding.
            return super().forward(x, attn_bias=attn_bias, xpos=xpos)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        # (B, N, 3, heads, head_dim) -> (B, heads, 3, N, head_dim)
        qkv = qkv.transpose(1, 3)
        q, k, v = [qkv[:, :, i] for i in range(3)]
        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        # Back to (B, N, heads, head_dim) for the xFormers call.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Neither self.scale nor self.attn_drop is applied explicitly on this path.
        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
def get_attn_score(blk_class, x, frame_num, token_length, xpos=None):
    """Compute a per-frame attention score from a block's query/key projections.

    The block's ``norm1`` and its attention sub-modules (``qkv``, ``q_norm``,
    ``k_norm``, optional ``rope``) are reused to build scaled query-key
    logits. The logits are summed over heads, averaged inside every
    ``token_length x token_length`` frame-pair tile, and finally summed over
    the key frames.

    Args:
        blk_class: Object exposing ``norm1`` and ``attn`` (with ``qkv``,
            ``num_heads``, ``q_norm``, ``k_norm``, ``rope`` and ``scale``).
        x: Input tensor; its shape is unpacked as ``(B, N, C)`` and ``N`` must
            equal ``frame_num * token_length`` for the reshape to succeed.
        frame_num: Number of frames the token axis is split into.
        token_length: Number of tokens per frame.
        xpos: Positions passed to ``attn.rope`` when it is not ``None``.

    Returns:
        Tensor of shape ``(B, frame_num)``.
    """
    attn = blk_class.attn
    x = blk_class.norm1(x)

    B, N, C = x.shape
    heads = attn.num_heads

    # (B, N, 3, H, D) -> (B, H, 3, N, D); each of q/k/v is then (B, H, N, D).
    qkv = attn.qkv(x).reshape(B, N, 3, heads, C // heads).transpose(1, 3)
    q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
    q = attn.q_norm(q).to(v.dtype)
    k = attn.k_norm(k).to(v.dtype)

    if attn.rope is not None:
        q = attn.rope(q, xpos)
        k = attn.rope(k, xpos)

    # q and k are already head-major, so the logits can be formed directly.
    logits = (q * attn.scale) @ k.transpose(-2, -1)   # (B, H, N, N)
    head_sum = logits.sum(dim=1)                      # (B, N, N)
    tiles = head_sum.reshape(B, frame_num, token_length, frame_num, token_length)
    return tiles.mean(dim=[2, 4]).sum(-1)
class Block(nn.Module):
    """Pre-norm transformer block with an attention branch and an MLP branch.

    Each branch is ``norm -> sub-layer -> optional LayerScale`` and is added
    back to the input as a residual. During training the residuals may be
    dropped stochastically, controlled by ``drop_path``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        """Build the block.

        Args:
            dim: Channel width handed to the norms, attention and MLP.
            num_heads: Number of attention heads.
            mlp_ratio: Hidden MLP width is ``int(dim * mlp_ratio)``.
            qkv_bias: Passed to ``attn_class``.
            proj_bias: Passed to ``attn_class``.
            ffn_bias: Passed to ``ffn_layer`` as ``bias``.
            drop: Dropout rate for the attention projection and the MLP.
            attn_drop: Dropout rate passed to ``attn_class``.
            init_values: If truthy, both branches are wrapped in ``LayerScale``
                initialised with this value; otherwise ``nn.Identity`` is used.
            drop_path: Stochastic-depth rate; also stored as
                ``sample_drop_ratio``.
            act_layer: Activation factory for the MLP.
            norm_layer: Normalisation factory.
            attn_class: Attention factory.
            ffn_layer: Feed-forward factory.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        """Apply the attention and MLP residual branches to ``x``."""

        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # Fix: the FFN branch owns ``drop_path2``; the previous code reused
            # ``drop_path1`` here (flagged by its own FIXME). Both modules are
            # built from the same ``drop_path`` rate.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add a scaled residual onto the rows of ``x`` selected by ``brange``.

    Args:
        x: Full-batch tensor the residual is added to (not modified in place).
        brange: Index tensor of the batch rows that receive the residual.
        residual: Residual computed for the selected rows; cast to ``x.dtype``.
        residual_scale_factor: Multiplier applied to the residual (``alpha``).
        scaling_vector: Optional per-channel scaling. When given, the work is
            delegated to ``scaled_index_add``.

    Returns:
        Without ``scaling_vector``: the result of ``torch.index_add`` on the
        tensors flattened from dim 1, i.e. shape ``(batch, -1)``. With
        ``scaling_vector``: whatever ``scaled_index_add`` returns.
    """
    update = residual.to(dtype=x.dtype)

    if scaling_vector is not None:
        return scaled_index_add(
            x, brange, update, scaling=scaling_vector, alpha=residual_scale_factor
        )

    # Plain path: operate on (batch, features) views so index_add works on dim 0.
    return torch.index_add(
        x.flatten(1), 0, brange, update.flatten(1), alpha=residual_scale_factor
    )
class NestedTensorBlock(Block):
    """``Block`` variant that also accepts a list of tensors.

    A list input is concatenated into one sequence and processed with a cached
    block-diagonal attention bias (see ``get_attn_bias_and_cat``); this path
    requires xFormers.
    """

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:
            # LayerScale is not applied inside the residual functions here; its
            # ``gamma`` is passed as ``scaling_vector`` and applied while the
            # residual is added back.

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # Fix: guard ``ls2.gamma`` by checking ``ls2`` itself (the
                # previous code tested ``ls1`` here).
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch on the input type.

        A ``Tensor`` goes through ``Block.forward``; a ``list`` goes through
        ``forward_nested``.

        Raises:
            AssertionError: For a list input without xFormers, or for any
                other input type.
        """
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
class CrossBlockRope(nn.Module):
    """Decoder-style block: self-attention, cross-attention to ``y``, then an MLP.

    Every sub-layer is pre-normalised and added back as a residual. The
    context ``y`` has its own norm (``norm_y``) and the cross-attention branch
    has its own optional LayerScale (``ls_y``).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        cross_attn_class: Callable[..., nn.Module] = CrossAttentionRope,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        init_values=None,
        qk_norm: bool=False,
        rope=None
    ) -> None:
        super().__init__()

        def make_layer_scale() -> nn.Module:
            # LayerScale only when init_values is truthy, identity otherwise.
            if init_values:
                return LayerScale(dim, init_values=init_values)
            return nn.Identity()

        # Both attention modules are configured identically.
        attn_kwargs = dict(
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            rope=rope,
            qk_norm=qk_norm,
        )

        # --- self-attention branch ---
        self.ls1 = make_layer_scale()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(dim, **attn_kwargs)

        # --- cross-attention branch (ls2 is consumed by the MLP branch) ---
        self.ls2 = make_layer_scale()
        self.ls_y = make_layer_scale()
        self.norm2 = norm_layer(dim)
        self.norm_y = norm_layer(dim)
        self.cross_attn = cross_attn_class(dim, **attn_kwargs)

        # --- feed-forward branch ---
        self.norm3 = norm_layer(dim)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            bias=ffn_bias,
        )

    def forward(self, x: Tensor, y: Tensor, xpos=None, ypos=None) -> Tensor:
        """Update ``x`` using itself and the context ``y``.

        Args:
            x: Query-side tokens.
            y: Context tokens, used as both key and value of the cross-attention.
            xpos: Positions for ``x`` (self-attention, and ``qpos`` of the
                cross-attention).
            ypos: Positions for ``y`` (``kpos`` of the cross-attention).

        Returns:
            The updated ``x``.
        """
        # 1) self-attention
        x = x + self.ls1(self.attn(self.norm1(x), xpos=xpos))

        # 2) cross-attention against the normalised context
        context = self.norm_y(y)
        cross = self.cross_attn(self.norm2(x), context, context, qpos=xpos, kpos=ypos)
        x = x + self.ls_y(cross)

        # 3) feed-forward
        x = x + self.ls2(self.mlp(self.norm3(x)))
        return x
self.out_channels) + self.res_conv3 = nn.Linear(self.out_channels, self.out_channels) + + def forward(self, res): + x = F.relu(self.res_conv1(res)) + x = F.relu(self.res_conv2(x)) + x = F.relu(self.res_conv3(x)) + res = self.head_skip(res) + x + return res + +class CameraHead(nn.Module): + def __init__(self, dim=512): + super().__init__() + output_dim = dim + self.res_conv = nn.ModuleList([deepcopy(ResConvBlock(output_dim, output_dim)) + for _ in range(2)]) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.more_mlps = nn.Sequential( + nn.Linear(output_dim,output_dim), + nn.ReLU(), + nn.Linear(output_dim,output_dim), + nn.ReLU() + ) + self.fc_t = nn.Linear(output_dim, 3) + self.fc_rot = nn.Linear(output_dim, 9) + + def forward(self, feat, patch_h, patch_w): + BN, hw, c = feat.shape + + for i in range(2): + feat = self.res_conv[i](feat) + + # feat = self.avgpool(feat) + feat = self.avgpool(feat.permute(0, 2, 1).reshape(BN, -1, patch_h, patch_w).contiguous()) ########## + feat = feat.view(feat.size(0), -1) + + feat = self.more_mlps(feat) # [B, D_] + with torch.amp.autocast(device_type='cuda', enabled=False): + out_t = self.fc_t(feat.float()) # [B,3] + out_r = self.fc_rot(feat.float()) # [B,9] + pose = self.convert_pose_to_4x4(BN, out_r, out_t, feat.device) + + return pose + + def convert_pose_to_4x4(self, B, out_r, out_t, device): + out_r = self.svd_orthogonalize(out_r) # [N,3,3] + pose = torch.zeros((B, 4, 4), device=device) + pose[:, :3, :3] = out_r + pose[:, :3, 3] = out_t + pose[:, 3, 3] = 1. + return pose + + def svd_orthogonalize(self, m): + """Convert 9D representation to SO(3) using SVD orthogonalization. + + Args: + m: [BATCH, 3, 3] 3x3 matrices. + + Returns: + [BATCH, 3, 3] SO(3) rotation matrices. 
+ """ + if m.dim() < 3: + m = m.reshape((-1, 3, 3)) + m_transpose = torch.transpose(torch.nn.functional.normalize(m, p=2, dim=-1), dim0=-1, dim1=-2) + u, s, v = torch.svd(m_transpose) + det = torch.det(torch.matmul(v, u.transpose(-2, -1))) + # Check orientation reflection. + r = torch.matmul( + torch.cat([v[:, :, :-1], v[:, :, -1:] * det.view(-1, 1, 1)], dim=2), + u.transpose(-2, -1) + ) + return r \ No newline at end of file diff --git a/prismatic/models/pi3/models/layers/pos_embed.py b/prismatic/models/pi3/models/layers/pos_embed.py new file mode 100644 index 0000000..e27ea0f --- /dev/null +++ b/prismatic/models/pi3/models/layers/pos_embed.py @@ -0,0 +1,174 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + + +# -------------------------------------------------------- +# Position embedding utils +# -------------------------------------------------------- + + + +import numpy as np + +import torch + +# -------------------------------------------------------- +# 2D sine-cosine position embedding +# References: +# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py +# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- +def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if n_cls_token>0: + pos_embed = 
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Sine/cosine embedding of a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; flattened to size (M,)
    out: (M, D) array laid out as [sin | cos], each half D/2 wide
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # Frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1.
    exponents = np.arange(half_dim, dtype=float) / (embed_dim / 2.)
    omega = 1. / 10000**exponents  # (D/2,)

    # Outer product of positions and frequencies: (M, D/2).
    angles = np.outer(pos.reshape(-1), omega)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
try:
    from models.curope import cuRoPE2D
    RoPE2D = cuRoPE2D
except ImportError:
    print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')

    class RoPE2D(torch.nn.Module):
        """Pure-PyTorch 2D rotary position embedding.

        The feature dimension is split in two halves; the first half is rotated
        according to the y position of each token and the second half according
        to its x position.
        """

        def __init__(self, freq=100.0, F0=1.0):
            super().__init__()
            self.base = freq
            self.F0 = F0
            # (D, seq_len, device, dtype) -> (cos, sin) lookup tables
            self.cache = {}

        def get_cos_sin(self, D, seq_len, device, dtype):
            """Return cached ``(cos, sin)`` tables of shape ``(seq_len, D)``."""
            key = (D, seq_len, device, dtype)
            if key not in self.cache:
                exponents = torch.arange(0, D, 2).float().to(device) / D
                inv_freq = 1.0 / (self.base ** exponents)
                t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
                freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
                # Duplicate so both halves used by rotate_half share an angle.
                freqs = torch.cat((freqs, freqs), dim=-1)
                cos = freqs.cos()  # (Seq, Dim)
                sin = freqs.sin()
                self.cache[key] = (cos, sin)
            return self.cache[key]

        @staticmethod
        def rotate_half(x):
            """Map ``[x1, x2]`` (split along the last dim) to ``[-x2, x1]``."""
            half = x.shape[-1] // 2
            first, second = x[..., :half], x[..., half:]
            return torch.cat((-second, first), dim=-1)

        def apply_rope1d(self, tokens, pos1d, cos, sin):
            """Rotate ``tokens`` by the table rows selected through ``pos1d``."""
            assert pos1d.ndim==2
            # Look up per-token angles and add a broadcast axis for the heads.
            cos_sel = torch.nn.functional.embedding(pos1d, cos).unsqueeze(1)
            sin_sel = torch.nn.functional.embedding(pos1d, sin).unsqueeze(1)
            return (tokens * cos_sel) + (self.rotate_half(tokens) * sin_sel)

        def forward(self, tokens, positions):
            """
            input:
                * tokens: batch_size x nheads x ntokens x dim
                * positions: batch_size x ntokens x 2 (y and x position of each token)
            output:
                * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
            """
            assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
            D = tokens.size(3) // 2
            assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
            # Tables must cover every position index that occurs in the batch.
            table_len = int(positions.max()) + 1
            cos, sin = self.get_cos_sin(D, table_len, tokens.device, tokens.dtype)
            # split features into two along the feature dimension, and apply rope1d on each half
            y_half, x_half = tokens.chunk(2, dim=-1)
            y_half = self.apply_rope1d(y_half, positions[:, :, 0], cos, sin)
            x_half = self.apply_rope1d(x_half, positions[:, :, 1], cos, sin)
            return torch.cat((y_half, x_half), dim=-1)
class LinearPts3d (nn.Module):
    """
    Linear per-token prediction head.
    Each token is projected to ``output_dim * patch_size**2`` values, which are
    rearranged into a ``patch_size x patch_size`` pixel tile of ``output_dim``
    channels.
    """

    def __init__(self, patch_size, dec_embed_dim, output_dim=3,):
        super().__init__()
        self.patch_size = patch_size

        self.proj = nn.Linear(dec_embed_dim, (output_dim)*self.patch_size**2)

    def forward(self, decout, img_shape):
        """Turn the last entry of ``decout`` into a dense ``(B, H, W, output_dim)`` map.

        Args:
            decout: Sequence of decoder outputs; only ``decout[-1]`` is used and
                its shape is unpacked as ``(B, S, D)``.
            img_shape: ``(H, W)`` of the target image.
        """
        H, W = img_shape
        tokens = decout[-1]
        B, S, D = tokens.shape
        grid_h = H // self.patch_size
        grid_w = W // self.patch_size

        # per-token projection: (B, S, output_dim * patch_size**2)
        projected = self.proj(tokens)

        # channels first, tokens laid out on the patch grid
        patch_map = projected.transpose(-1, -2).view(B, -1, grid_h, grid_w)

        # expand every token into its pixel tile: (B, output_dim, H, W)
        dense = F.pixel_shuffle(patch_map, self.patch_size)

        # channels last
        return dense.permute(0, 2, 3, 1)
class Pi3(nn.Module, PyTorchModelHubMixin):
    """Multi-view model predicting per-pixel points, confidences and camera poses.

    A DINOv2 ViT-L/14 (with registers) encodes every view; a stack of
    ``BlockRope`` layers then processes the tokens, alternating between
    per-view and per-sample token groupings. Three ``TransformerDecoder``
    heads produce local points, confidence and camera pose.
    """

    def __init__(
            self,
            pos_type='rope100',
            decoder_size='large',
        ):
        """Build the model.

        Args:
            pos_type: ``'rope<freq>'`` (e.g. ``'rope100'``). Anything else,
                including ``None``, raises ``NotImplementedError``.
            decoder_size: One of ``'small'``, ``'base'``, ``'large'``.
        """
        super().__init__()

        # ----------------------
        #        Encoder
        # ----------------------
        self.encoder = dinov2_vitl14_reg(pretrained=False)
        self.patch_size = 14
        del self.encoder.mask_token

        # ----------------------
        #  Positional Encoding
        # ----------------------
        self.pos_type = 'none' if pos_type is None else pos_type
        self.rope = None
        if not self.pos_type.startswith('rope'):
            raise NotImplementedError
        if RoPE2D is None:
            raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
        # e.g. 'rope100' -> base frequency 100.0
        rope_freq = float(self.pos_type[len('rope'):])
        self.rope = RoPE2D(freq=rope_freq)
        self.position_getter = PositionGetter()

        # ----------------------
        #        Decoder
        # ----------------------
        enc_embed_dim = self.encoder.blocks[0].attn.qkv.in_features        # 1024
        # decoder_size -> (embed_dim, num_heads, mlp_ratio, depth)
        decoder_presets = {
            'small': (384, 6, 4, 24),
            'base': (768, 12, 4, 24),
            'large': (1024, 16, 4, 36),
        }
        if decoder_size not in decoder_presets:
            raise NotImplementedError
        dec_embed_dim, dec_num_heads, mlp_ratio, dec_depth = decoder_presets[decoder_size]

        decoder_blocks = []
        for _ in range(dec_depth):
            decoder_blocks.append(
                BlockRope(
                    dim=dec_embed_dim,
                    num_heads=dec_num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=True,
                    proj_bias=True,
                    ffn_bias=True,
                    drop_path=0.0,
                    norm_layer=partial(nn.LayerNorm, eps=1e-6),
                    act_layer=nn.GELU,
                    ffn_layer=Mlp,
                    init_values=0.01,
                    qk_norm=True,
                    attn_class=FlashAttentionRope,
                    rope=self.rope
                )
            )
        self.decoder = nn.ModuleList(decoder_blocks)
        self.dec_embed_dim = dec_embed_dim

        # ----------------------
        #     Register_token
        # ----------------------
        num_register_tokens = 5
        # Index of the first patch token once register tokens are prepended.
        self.patch_start_idx = num_register_tokens
        self.register_token = nn.Parameter(torch.randn(1, 1, num_register_tokens, self.dec_embed_dim))
        nn.init.normal_(self.register_token, std=1e-6)

        # ----------------------
        #  Local Points Decoder
        # ----------------------
        # Heads take the concatenation of the last two decoder outputs,
        # hence the 2x input width.
        self.point_decoder = TransformerDecoder(
            in_dim=2*self.dec_embed_dim,
            dec_embed_dim=1024,
            dec_num_heads=16,
            out_dim=1024,
            rope=self.rope,
        )
        self.point_head = LinearPts3d(patch_size=self.patch_size, dec_embed_dim=1024, output_dim=3)

        # ----------------------
        #     Conf Decoder
        # ----------------------
        self.conf_decoder = deepcopy(self.point_decoder)
        self.conf_head = LinearPts3d(patch_size=self.patch_size, dec_embed_dim=1024, output_dim=1)

        # ----------------------
        #  Camera Pose Decoder
        # ----------------------
        self.camera_decoder = TransformerDecoder(
            in_dim=2*self.dec_embed_dim,
            dec_embed_dim=1024,
            dec_num_heads=16,                # 8
            out_dim=512,
            rope=self.rope,
            use_checkpoint=False
        )
        self.camera_head = CameraHead(dim=512)

        # For ImageNet Normalize
        self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
        self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))

    def decode(self, hidden, N, H, W):
        """Run the decoder stack over encoder tokens.

        Args:
            hidden: Encoder tokens; shape is unpacked as ``(B*N, hw, C)``.
            N: Number of views per sample.
            H: Image height in pixels.
            W: Image width in pixels.

        Returns:
            Tuple ``(features, pos)``: the outputs of the last two decoder
            blocks concatenated on the channel axis, and the token positions,
            both with leading shape ``(B*N, tokens_per_view)``.
        """
        BN, hw, _ = hidden.shape
        B = BN // N

        hidden = hidden.reshape(B*N, hw, -1)

        # Concatenate special tokens with patch tokens
        register_token = self.register_token.repeat(B, N, 1, 1).reshape(B*N, *self.register_token.shape[-2:])
        hidden = torch.cat([register_token, hidden], dim=1)
        hw = hidden.shape[1]

        if self.pos_type.startswith('rope'):
            pos = self.position_getter(B * N, H//self.patch_size, W//self.patch_size, hidden.device)

        if self.patch_start_idx > 0:
            # do not use position embedding for special tokens (camera and register tokens)
            # so set pos to 0 for the special tokens
            pos = pos + 1
            pos_special = torch.zeros(B * N, self.patch_start_idx, 2).to(hidden.device).to(pos.dtype)
            pos = torch.cat([pos_special, pos], dim=1)

        depth = len(self.decoder)
        final_output = []
        for idx, blk in enumerate(self.decoder):
            if idx % 2 == 0:
                # even blocks: every view is its own sequence
                group_shape = (B*N, hw, -1)
            else:
                # odd blocks: all views of a sample form one sequence
                group_shape = (B, N*hw, -1)
            pos = pos.reshape(*group_shape)
            hidden = hidden.reshape(*group_shape)

            hidden = blk(hidden, xpos=pos)

            # keep the outputs of the last two blocks
            if idx >= depth - 2:
                final_output.append(hidden.reshape(B*N, hw, -1))

        features = torch.cat([final_output[0], final_output[1]], dim=-1)
        return features, pos.reshape(B*N, hw, -1)

    def forward(self, imgs):
        """Predict geometry for a batch of image sets.

        Args:
            imgs: Tensor whose shape is unpacked as ``(B, N, C, H, W)``. It is
                normalised with the stored ImageNet mean/std.

        Returns:
            Dict with ``points`` and ``local_points`` (``(B, N, H, W, 3)``),
            ``conf`` (``(B, N, H, W, 1)``) and ``camera_poses``
            (``(B, N, 4, 4)``).
        """
        imgs = (imgs - self.image_mean) / self.image_std

        B, N, C, H, W = imgs.shape
        patch_h, patch_w = H // self.patch_size, W // self.patch_size

        # encode by dinov2
        imgs = imgs.reshape(B*N, C, H, W)
        hidden = self.encoder(imgs, is_training=True)
        if isinstance(hidden, dict):
            hidden = hidden["x_norm_patchtokens"]

        hidden, pos = self.decode(hidden, N, H, W)

        point_hidden = self.point_decoder(hidden, xpos=pos)
        conf_hidden = self.conf_decoder(hidden, xpos=pos)
        camera_hidden = self.camera_decoder(hidden, xpos=pos)

        first_patch = self.patch_start_idx
        with torch.amp.autocast(device_type='cuda', enabled=False):
            # local points: the head predicts (x/z, y/z, log z)
            point_hidden = point_hidden.float()
            ret = self.point_head([point_hidden[:, first_patch:]], (H, W)).reshape(B, N, H, W, -1)
            xy, z = ret.split([2, 1], dim=-1)
            z = torch.exp(z)
            local_points = torch.cat([xy * z, z], dim=-1)

            # confidence
            conf_hidden = conf_hidden.float()
            conf = self.conf_head([conf_hidden[:, first_patch:]], (H, W)).reshape(B, N, H, W, -1)

            # camera
            camera_hidden = camera_hidden.float()
            camera_poses = self.camera_head(camera_hidden[:, first_patch:], patch_h, patch_w).reshape(B, N, 4, 4)

            # unproject local points using camera poses
            points = torch.einsum('bnij, bnhwj -> bnhwi', camera_poses, homogenize_points(local_points))[..., :3]

        return dict(
            points=points,
            local_points=local_points,
            conf=conf,
            camera_poses=camera_poses,
        )
Must be a directory or a .mp4 file: {path}") + + if not sources: + print("No images found or loaded.") + return torch.empty(0) + + print(f"Found {len(sources)} images/frames. Processing...") + + # --- 2. Determine a uniform target size for all images based on the first image --- + # This is necessary to ensure all tensors have the same dimensions for stacking. + first_img = sources[0] + W_orig, H_orig = first_img.size + scale = math.sqrt(PIXEL_LIMIT / (W_orig * H_orig)) if W_orig * H_orig > 0 else 1 + W_target, H_target = W_orig * scale, H_orig * scale + k, m = round(W_target / 14), round(H_target / 14) + while (k * 14) * (m * 14) > PIXEL_LIMIT: + if k / m > W_target / H_target: k -= 1 + else: m -= 1 + TARGET_W, TARGET_H = max(1, k) * 14, max(1, m) * 14 + print(f"All images will be resized to a uniform size: ({TARGET_W}, {TARGET_H})") + + # --- 3. Resize images and convert them to tensors in the [0, 1] range --- + tensor_list = [] + # Define a transform to convert a PIL Image to a CxHxW tensor and normalize to [0,1] + to_tensor_transform = transforms.ToTensor() + + for img_pil in sources: + try: + # Resize to the uniform target size + resized_img = img_pil.resize((TARGET_W, TARGET_H), Image.Resampling.LANCZOS) + # Convert to tensor + img_tensor = to_tensor_transform(resized_img) + tensor_list.append(img_tensor) + except Exception as e: + print(f"Error processing an image: {e}") + + if not tensor_list: + print("No images were successfully processed.") + return torch.empty(0) + + # --- 4. Stack the list of tensors into a single [N, C, H, W] batch tensor --- + return torch.stack(tensor_list, dim=0) + + +def tensor_to_pil(tensor): + """ + Converts a PyTorch tensor to a PIL image. Automatically moves the channel dimension + (if it has size 3) to the last axis before converting. + + Args: + tensor (torch.Tensor): Input tensor. Expected shape can be [C, H, W], [H, W, C], or [H, W]. + + Returns: + PIL.Image: The converted PIL image. 
+ """ + if torch.is_tensor(tensor): + array = tensor.detach().cpu().numpy() + else: + array = tensor + + return array_to_pil(array) + + +def array_to_pil(array): + """ + Converts a NumPy array to a PIL image. Automatically: + - Squeezes dimensions of size 1. + - Moves the channel dimension (if it has size 3) to the last axis. + + Args: + array (np.ndarray): Input array. Expected shape can be [C, H, W], [H, W, C], or [H, W]. + + Returns: + PIL.Image: The converted PIL image. + """ + # Remove singleton dimensions + array = np.squeeze(array) + + # Ensure the array has the channel dimension as the last axis + if array.ndim == 3 and array.shape[0] == 3: # If the channel is the first axis + array = np.transpose(array, (1, 2, 0)) # Move channel to the last axis + + # Handle single-channel grayscale images + if array.ndim == 2: # [H, W] + return Image.fromarray((array * 255).astype(np.uint8), mode="L") + elif array.ndim == 3 and array.shape[2] == 3: # [H, W, C] with 3 channels + return Image.fromarray((array * 255).astype(np.uint8), mode="RGB") + else: + raise ValueError(f"Unsupported array shape for PIL conversion: {array.shape}") + + +def rotate_target_dim_to_last_axis(x, target_dim=3): + shape = x.shape + axis_to_move = -1 + # Iterate backwards to find the first occurrence from the end + # (which corresponds to the last dimension of size 3 in the original order). + for i in range(len(shape) - 1, -1, -1): + if shape[i] == target_dim: + axis_to_move = i + break + + # 2. If the axis is found and it's not already in the last position, move it. + if axis_to_move != -1 and axis_to_move != len(shape) - 1: + # Create the new dimension order. + dims_order = list(range(len(shape))) + dims_order.pop(axis_to_move) + dims_order.append(axis_to_move) + + # Use permute to reorder the dimensions. 
+ ret = x.transpose(*dims_order) + else: + ret = x + + return ret + + +def write_ply( + xyz, + rgb=None, + path='output.ply', +) -> None: + if torch.is_tensor(xyz): + xyz = xyz.detach().cpu().numpy() + + if torch.is_tensor(rgb): + rgb = rgb.detach().cpu().numpy() + + if rgb is not None and rgb.max() > 1: + rgb = rgb / 255. + + xyz = rotate_target_dim_to_last_axis(xyz, 3) + xyz = xyz.reshape(-1, 3) + + if rgb is not None: + rgb = rotate_target_dim_to_last_axis(rgb, 3) + rgb = rgb.reshape(-1, 3) + + if rgb is None: + min_coord = np.min(xyz, axis=0) + max_coord = np.max(xyz, axis=0) + normalized_coord = (xyz - min_coord) / (max_coord - min_coord + 1e-8) + + hue = 0.7 * normalized_coord[:,0] + 0.2 * normalized_coord[:,1] + 0.1 * normalized_coord[:,2] + hsv = np.stack([hue, 0.9*np.ones_like(hue), 0.8*np.ones_like(hue)], axis=1) + + c = hsv[:,2:] * hsv[:,1:2] + x = c * (1 - np.abs( (hsv[:,0:1]*6) % 2 - 1 )) + m = hsv[:,2:] - c + + rgb = np.zeros_like(hsv) + cond = (0 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 1) + rgb[cond] = np.hstack([c[cond], x[cond], np.zeros_like(x[cond])]) + cond = (1 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 2) + rgb[cond] = np.hstack([x[cond], c[cond], np.zeros_like(x[cond])]) + cond = (2 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 3) + rgb[cond] = np.hstack([np.zeros_like(x[cond]), c[cond], x[cond]]) + cond = (3 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 4) + rgb[cond] = np.hstack([np.zeros_like(x[cond]), x[cond], c[cond]]) + cond = (4 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 5) + rgb[cond] = np.hstack([x[cond], np.zeros_like(x[cond]), c[cond]]) + cond = (5 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 6) + rgb[cond] = np.hstack([c[cond], np.zeros_like(x[cond]), x[cond]]) + rgb = (rgb + m) + + dtype = [ + ("x", "f4"), + ("y", "f4"), + ("z", "f4"), + ("nx", "f4"), + ("ny", "f4"), + ("nz", "f4"), + ("red", "u1"), + ("green", "u1"), + ("blue", "u1"), + ] + normals = np.zeros_like(xyz) + elements = np.empty(xyz.shape[0], dtype=dtype) + attributes = np.concatenate((xyz, normals, rgb * 255), axis=1) 
+ elements[:] = list(map(tuple, attributes)) + vertex_element = PlyElement.describe(elements, "vertex") + ply_data = PlyData([vertex_element]) + ply_data.write(path) \ No newline at end of file diff --git a/prismatic/models/pi3/utils/debug.py b/prismatic/models/pi3/utils/debug.py new file mode 100644 index 0000000..f3da8f3 --- /dev/null +++ b/prismatic/models/pi3/utils/debug.py @@ -0,0 +1,63 @@ +import os +import json +import debugpy +import socket +import random + +def update_vscode_launch_file(host: str, port: int): + """Update the .vscode/launch.json file with the new host and port.""" + launch_file_path = ".vscode/launch.json" + # Desired configuration + new_config = { + "version": "0.2.0", + "configurations": [ + { + "name": "bash_debug", + "type": "debugpy", + "request": "attach", + "connect": { + "host": host, + "port": port + }, + "justMyCode": False + }, + ] + } + + # Ensure the .vscode directory exists + if not os.path.exists(".vscode"): + os.makedirs(".vscode") + + # Write the updated configuration to launch.json + with open(launch_file_path, "w") as f: + json.dump(new_config, f, indent=4) + print(f"Updated {launch_file_path} with host: {host} and port: {port}") + +def is_port_in_use(host, port): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex((host, port)) == 0 + +def setup_debug(is_main_process=True, max_retries=10, port_range=(10000, 20000)): + if is_main_process: + host = os.environ['SLURM_NODELIST'].split(',')[0] + + for _ in range(max_retries): + port = random.randint(*port_range) + try: + if is_port_in_use(host, port): + print(f"Port {port} is already in use, trying another...") + continue + + # 更新 launch.json + update_vscode_launch_file(host, port) + + print("master_addr = ", host) + debugpy.listen((host, port)) + print(f"Waiting for debugger attach at port {port}...", flush=True) + debugpy.wait_for_client() + print("Debugger attached", flush=True) + return + except Exception as e: + print(f"Failed to bind to 
port {port}: {e}") + + raise RuntimeError("Could not find a free port for debugpy after several attempts.") \ No newline at end of file diff --git a/prismatic/models/pi3/utils/geometry.py b/prismatic/models/pi3/utils/geometry.py new file mode 100644 index 0000000..515a36f --- /dev/null +++ b/prismatic/models/pi3/utils/geometry.py @@ -0,0 +1,375 @@ +import numpy as np +import torch +import torch.nn.functional as F + +def se3_inverse(T): + """ + Computes the inverse of a batch of SE(3) matrices. + T: Tensor of shape (B, 4, 4) + """ + if len(T.shape) == 2: + T = T[None] + unseq_flag = True + else: + unseq_flag = False + + if torch.is_tensor(T): + R = T[:, :3, :3] + t = T[:, :3, 3].unsqueeze(-1) + R_inv = R.transpose(-2, -1) + t_inv = -torch.matmul(R_inv, t) + T_inv = torch.cat([ + torch.cat([R_inv, t_inv], dim=-1), + torch.tensor([0, 0, 0, 1], device=T.device, dtype=T.dtype).repeat(T.shape[0], 1, 1) + ], dim=1) + else: + R = T[:, :3, :3] + t = T[:, :3, 3, np.newaxis] + + R_inv = np.swapaxes(R, -2, -1) + t_inv = -R_inv @ t + + bottom_row = np.zeros((T.shape[0], 1, 4), dtype=T.dtype) + bottom_row[:, :, 3] = 1 + + top_part = np.concatenate([R_inv, t_inv], axis=-1) + T_inv = np.concatenate([top_part, bottom_row], axis=1) + + if unseq_flag: + T_inv = T_inv[0] + return T_inv + +def get_pixel(H, W): + # get 2D pixels (u, v) for image_a in cam_a pixel space + u_a, v_a = np.meshgrid(np.arange(W), np.arange(H)) + # u_a = np.flip(u_a, axis=1) + # v_a = np.flip(v_a, axis=0) + pixels_a = np.stack([ + u_a.flatten() + 0.5, + v_a.flatten() + 0.5, + np.ones_like(u_a.flatten()) + ], axis=0) + + return pixels_a + +def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, z_far=0, **kw): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + - camera_pose: a 4x3 or 4x4 cam2world matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.""" + X_cam, valid_mask = 
depthmap_to_camera_coordinates(depthmap, camera_intrinsics) + if z_far > 0: + valid_mask = valid_mask & (depthmap < z_far) + + X_world = X_cam # default + if camera_pose is not None: + # R_cam2world = np.float32(camera_params["R_cam2world"]) + # t_cam2world = np.float32(camera_params["t_cam2world"]).squeeze() + R_cam2world = camera_pose[:3, :3] + t_cam2world = camera_pose[:3, 3] + + # Express in absolute coordinates (invalid depth values) + X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :] + + return X_world, valid_mask + + +def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels. + """ + camera_intrinsics = np.float32(camera_intrinsics) + H, W = depthmap.shape + + # Compute 3D ray associated with each pixel + # Strong assumption: there are no skew terms + # assert camera_intrinsics[0, 1] == 0.0 + # assert camera_intrinsics[1, 0] == 0.0 + if pseudo_focal is None: + fu = camera_intrinsics[0, 0] + fv = camera_intrinsics[1, 1] + else: + assert pseudo_focal.shape == (H, W) + fu = fv = pseudo_focal + cu = camera_intrinsics[0, 2] + cv = camera_intrinsics[1, 2] + + u, v = np.meshgrid(np.arange(W), np.arange(H)) + z_cam = depthmap + x_cam = (u - cu) * z_cam / fu + y_cam = (v - cv) * z_cam / fv + X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32) + + # Mask for valid coordinates + valid_mask = (depthmap > 0.0) + # Invalid any depth > 80m + valid_mask = valid_mask + return X_cam, valid_mask + +def homogenize_points( + points, +): + """Convert batched points (xyz) to (xyz1).""" + return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1) + + +def get_gt_warp(depth1, depth2, T_1to2, K1, K2, depth_interpolation_mode = 'bilinear', relative_depth_error_threshold = 0.05, H = None, W = None): + + if H is None: + B,H,W = 
depth1.shape + else: + B = depth1.shape[0] + with torch.no_grad(): + x1_n = torch.meshgrid( + *[ + torch.linspace( + -1 + 1 / n, 1 - 1 / n, n, device=depth1.device + ) + for n in (B, H, W) + ], + indexing = 'ij' + ) + x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H * W, 2) + mask, x2 = warp_kpts( + x1_n.double(), + depth1.double(), + depth2.double(), + T_1to2.double(), + K1.double(), + K2.double(), + depth_interpolation_mode = depth_interpolation_mode, + relative_depth_error_threshold = relative_depth_error_threshold, + ) + prob = mask.float().reshape(B, H, W) + x2 = x2.reshape(B, H, W, 2) + return x2, prob + +@torch.no_grad() +def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, smooth_mask = False, return_relative_depth_error = False, depth_interpolation_mode = "bilinear", relative_depth_error_threshold = 0.05): + """Warp kpts0 from I0 to I1 with depth, K and Rt + Also check covisibility and depth consistency. + Depth is consistent if relative error < 0.2 (hard-coded). + # https://github.com/zju3dv/LoFTR/blob/94e98b695be18acb43d5d3250f52226a8e36f839/src/loftr/utils/geometry.py adapted from here + Args: + kpts0 (torch.Tensor): [N, L, 2] - , should be normalized in (-1,1) + depth0 (torch.Tensor): [N, H, W], + depth1 (torch.Tensor): [N, H, W], + T_0to1 (torch.Tensor): [N, 3, 4], + K0 (torch.Tensor): [N, 3, 3], + K1 (torch.Tensor): [N, 3, 3], + Returns: + calculable_mask (torch.Tensor): [N, L] + warped_keypoints0 (torch.Tensor): [N, L, 2] + """ + ( + n, + h, + w, + ) = depth0.shape + if depth_interpolation_mode == "combined": + # Inspired by approach in inloc, try to fill holes from bilinear interpolation by nearest neighbour interpolation + if smooth_mask: + raise NotImplementedError("Combined bilinear and NN warp not implemented") + valid_bilinear, warp_bilinear = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, + smooth_mask = smooth_mask, + return_relative_depth_error = return_relative_depth_error, + depth_interpolation_mode = "bilinear", + 
relative_depth_error_threshold = relative_depth_error_threshold) + valid_nearest, warp_nearest = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, + smooth_mask = smooth_mask, + return_relative_depth_error = return_relative_depth_error, + depth_interpolation_mode = "nearest-exact", + relative_depth_error_threshold = relative_depth_error_threshold) + nearest_valid_bilinear_invalid = (~valid_bilinear).logical_and(valid_nearest) + warp = warp_bilinear.clone() + warp[nearest_valid_bilinear_invalid] = warp_nearest[nearest_valid_bilinear_invalid] + valid = valid_bilinear | valid_nearest + return valid, warp + + + kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode = depth_interpolation_mode, align_corners=False)[ + :, 0, :, 0 + ] + kpts0 = torch.stack( + (w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1 + ) # [-1+1/h, 1-1/h] -> [0.5, h-0.5] + # Sample depth, get calculable_mask on depth != 0 + # nonzero_mask = kpts0_depth != 0 + # Sample depth, get calculable_mask on depth > 0 + nonzero_mask = kpts0_depth > 0 + + # Unproject + kpts0_h = ( + torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) + * kpts0_depth[..., None] + ) # (N, L, 3) + kpts0_n = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) + kpts0_cam = kpts0_n + + # Rigid Transform + w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L) + w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] + + # Project + w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) + w_kpts0 = w_kpts0_h[:, :, :2] / ( + w_kpts0_h[:, :, [2]] + 1e-4 + ) # (N, L, 2), +1e-4 to avoid zero depth + + # Covisible Check + h, w = depth1.shape[1:3] + covisible_mask = ( + (w_kpts0[:, :, 0] > 0) + * (w_kpts0[:, :, 0] < w - 1) + * (w_kpts0[:, :, 1] > 0) + * (w_kpts0[:, :, 1] < h - 1) + ) + w_kpts0 = torch.stack( + (2 * w_kpts0[..., 0] / w - 1, 2 * w_kpts0[..., 1] / h - 1), dim=-1 + ) # from [0.5,h-0.5] -> [-1+1/h, 1-1/h] + # w_kpts0[~covisible_mask, :] = -5 # xd + + w_kpts0_depth = 
F.grid_sample( + depth1[:, None], w_kpts0[:, :, None], mode=depth_interpolation_mode, align_corners=False + )[:, 0, :, 0] + + relative_depth_error = ( + (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth + ).abs() + if not smooth_mask: + consistent_mask = relative_depth_error < relative_depth_error_threshold + else: + consistent_mask = (-relative_depth_error/smooth_mask).exp() + valid_mask = nonzero_mask * covisible_mask * consistent_mask + if return_relative_depth_error: + return relative_depth_error, w_kpts0 + else: + return valid_mask, w_kpts0 + + +def geotrf(Trf, pts, ncol=None, norm=False): + """ Apply a geometric transformation to a list of 3-D points. + + H: 3x3 or 4x4 projection matrix (typically a Homography) + p: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3) + + ncol: int. number of columns of the result (2 or 3) + norm: float. if != 0, the resut is projected on the z=norm plane. + + Returns an array of projected 2d points. + """ + assert Trf.ndim >= 2 + if isinstance(Trf, np.ndarray): + pts = np.asarray(pts) + elif isinstance(Trf, torch.Tensor): + pts = torch.as_tensor(pts, dtype=Trf.dtype) + + # adapt shape if necessary + output_reshape = pts.shape[:-1] + ncol = ncol or pts.shape[-1] + + # optimized code + if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and + Trf.ndim == 3 and pts.ndim == 4): + d = pts.shape[3] + if Trf.shape[-1] == d: + pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts) + elif Trf.shape[-1] == d + 1: + pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d] + else: + raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}') + else: + if Trf.ndim >= 3: + n = Trf.ndim - 2 + assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match' + Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1]) + + if pts.ndim > Trf.ndim: + # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d) + pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1]) + elif pts.ndim == 
2: + # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d) + pts = pts[:, None, :] + + if pts.shape[-1] + 1 == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :] + elif pts.shape[-1] == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf + else: + pts = Trf @ pts.T + if pts.ndim >= 2: + pts = pts.swapaxes(-1, -2) + + if norm: + pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG + if norm != 1: + pts *= norm + + res = pts[..., :ncol].reshape(*output_reshape, ncol) + return res + + +def inv(mat): + """ Invert a torch or numpy matrix + """ + if isinstance(mat, torch.Tensor): + return torch.linalg.inv(mat) + if isinstance(mat, np.ndarray): + return np.linalg.inv(mat) + raise ValueError(f'bad matrix type = {type(mat)}') + +def opencv_camera_to_plucker(poses, K, H, W): + device = poses.device + B = poses.shape[0] + + pixel = torch.from_numpy(get_pixel(H, W).astype(np.float32)).to(device).T.reshape(H, W, 3)[None].repeat(B, 1, 1, 1) # (3, H, W) + pixel = torch.einsum('bij, bhwj -> bhwi', torch.inverse(K), pixel) + ray_directions = torch.einsum('bij, bhwj -> bhwi', poses[..., :3, :3], pixel) + + ray_origins = poses[..., :3, 3][:, None, None].repeat(1, H, W, 1) + + ray_directions = ray_directions / ray_directions.norm(dim=-1, keepdim=True) + plucker_normal = torch.cross(ray_origins, ray_directions, dim=-1) + plucker_ray = torch.cat([ray_directions, plucker_normal], dim=-1) + + return plucker_ray + + +def depth_edge(depth: torch.Tensor, atol: float = None, rtol: float = None, kernel_size: int = 3, mask: torch.Tensor = None) -> torch.BoolTensor: + """ + Compute the edge mask of a depth map. The edge is defined as the pixels whose neighbors have a large difference in depth. 
+ + Args: + depth (torch.Tensor): shape (..., height, width), linear depth map + atol (float): absolute tolerance + rtol (float): relative tolerance + + Returns: + edge (torch.Tensor): shape (..., height, width) of dtype torch.bool + """ + shape = depth.shape + depth = depth.reshape(-1, 1, *shape[-2:]) + if mask is not None: + mask = mask.reshape(-1, 1, *shape[-2:]) + + if mask is None: + diff = (F.max_pool2d(depth, kernel_size, stride=1, padding=kernel_size // 2) + F.max_pool2d(-depth, kernel_size, stride=1, padding=kernel_size // 2)) + else: + diff = (F.max_pool2d(torch.where(mask, depth, -torch.inf), kernel_size, stride=1, padding=kernel_size // 2) + F.max_pool2d(torch.where(mask, -depth, -torch.inf), kernel_size, stride=1, padding=kernel_size // 2)) + + edge = torch.zeros_like(depth, dtype=torch.bool) + if atol is not None: + edge |= diff > atol + if rtol is not None: + edge |= (diff / depth).nan_to_num_() > rtol + edge = edge.reshape(*shape) + return edge \ No newline at end of file diff --git a/prismatic/models/vlas/__pycache__/__init__.cpython-310.pyc b/prismatic/models/vlas/__pycache__/__init__.cpython-310.pyc index 9a05ffb46260e32e782481ced0efba1bac40d596..66e0a147b34745ff54596119417684a399d2a3e3 100644 GIT binary patch delta 57 zcmX@ac$|?tpO=@50SLU-U6{z7tmLepk)NBYUsRfzk(!rYo|u=eUzU@oALiqz>zI;Q LP?B0SvD+K~@C*{c delta 52 zcmX@kc!-fZpO=@50SM$SpPR^?ENiDzI;Q LP?B0SagsRz^g9yW delta 52 zcmX@dc#@GjpO=@50SM$TpPR^?C2OZ&l%HRspIBOwpOT|nl3SpkUyzztmXoNPpH?z) Gx;X%yff0KE diff --git a/prismatic/models/vlms/__pycache__/base_vlm.cpython-310.pyc b/prismatic/models/vlms/__pycache__/base_vlm.cpython-310.pyc index 996f58d75c1b4f26bd28df8042cc726fc254afed..017e46bf8ee4817b8085c2486f3b6ff587423d81 100644 GIT binary patch delta 60 zcmcbra!Z9fpO=@50SLU-UD(KdkWI;1KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Ov7jWiX!A?99Bu$hfE0HC delta 55 zcmcbma#e*ppO=@50SM$TpWDcNkWJQ3zbHSyL_e{#BtIobw$?pOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L zSWuE$w0Q;l3?BJkylJI5Il5_?d8s9(d8t)=KvlY6p^~D+%)H6*B2FU0If=z3x;crJ 
SsYP&wlNX4HZ9Xk>K^*|iXDyQe delta 139 zcmZ2bIlGcOpO=@50SM$TpWDdYz%FaAUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) zc`f@49{DOh5I3zf2Pj{Zn3?yB7tBt}%u6jP%}bq}BI2Ya0#cHbSX=@mDpQLf3af;X NL}1D{Ul6&V4ggxPGdlnP diff --git a/prismatic/training/strategies/__pycache__/__init__.cpython-310.pyc b/prismatic/training/strategies/__pycache__/__init__.cpython-310.pyc index 5c96e2060d161fab616ba9b977eb1a500e649156..fe4624e162f2a61a46e83a34165ac8a094d333bd 100644 GIT binary patch delta 57 zcmZ3!mT*B)YD#8uNoq=bNq%-}UVLUs@vX@|vbv1ZCs)a;vq8+A zyj-@Eao%P_ISod}{>f4Do{VcIFOheY>I0bs@<5R|h>!#k0w7}N=Ku2gjEw6y>nq-2 LWZX5GM>z=qr*%gE delta 235 zcmezD_RftvpO=@50SMGDpWDbii%Zs9zbHSyL_e{#BtIobw!C$Oiz6$|iaM delta 91 zcmaE+`cRcSpO=@50SMGDpWDd&jYZZ`zbHSyL_e{#BtIobwX->{&d7)Bn##@u8%H{w`Cb?b!DrX(T delta 92 zcmbPY+i%OA&&$ij00ion&u!%1%pvQfUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) q`2mLnCqEyEmsXkslrBol%$xjOP-U~WP#!m9)Z|&RIY5$Kt``7sg&w^C diff --git a/prismatic/vla/datasets/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/__pycache__/__init__.cpython-310.pyc index da9750c715c3a9ba19e0dc856460646be98145e1..4feabe19fe02051d1006b85c2a53d76c74f44fe6 100644 GIT binary patch delta 133 zcmdnRG@pq#pO=@50SLU-UC4->$ScbzGf`WTpOGPjDVRZ%`6VMzN|R+`xJ!VaChIMZ zl*E$6;?$DjA|{}C5i^Kj0THY~VkJWnJBSS;e!1vpw;a diff --git a/prismatic/vla/datasets/__pycache__/datasets.cpython-310.pyc b/prismatic/vla/datasets/__pycache__/datasets.cpython-310.pyc index 0eab0718bd47bb6fd8ce64c3813ec7f36f5939f6..e3677267f2ed5d7c2c0657d8a3eb1b2a964d510e 100644 GIT binary patch delta 2306 zcmaKtTWl0n7{~YQt=rwUF1EE7x@F6y?MN>oa#69d&=vtj2nEAhGfsD>yVLH@Y|qR{ zA%RBaB3>f(pfSY65Rq78Ok`g)8eZ^~=&KWBG(Py?gNctAgWvxQbW1~Qv%mS~obNlA z?>66!zxGUnRbL+u(BIxKe=M|aU1;cFTff@4g++6bLjk3=GRMvZnSCuh#wu63YD4UV zxK-n_j3~#Nh-YF?FNT z_3?GAS8R$mGDVEW*RzzEj;H6lpbrIJ4|Ec|P*FF%n3l__j>}Cip4T0<%&k({;cK8Y z0<=Oi$~7}*mAsm4!7{SC?S=hYOk0NOMI4J~3trUGP21u;4XY+#Elk5XBkS;u>!duR zAGf`zRxayi&I@HVhp(q`rNax`dU1+(!>U(wCe|DpgMNd58g zvw%E-O;V<=WX#D>+hsk=kBPI518vo-A45?t1Q-1)3ahtI+#3{+ 
zqMWm`s=CbeGDNa;rT?q=+>y2(wtQJ;5)*=_1)N(TqOrH{&!U*~NRr zs~sn)dED+;n_M|BHl;eZyiKY~#LvZ*3RtCj5P2meObrvKQ%5`H%*FfYAa_<*{z+|S z^Ft&e4XKW-r}1k%`9Zve@rCjtE|tM{{smN}ujKWg z$E*CJ9e^|f90CplM}Rbd_UR`s+Ln^=J&0QMB1TEe>ztBjn7#>yNln?6A0e*7=w(0y z?DqpHc8{9`PI_Vvb4?GF6p*5tZi0UmhH{)a}p>1 zkzGHRraY6eGB%-Q8eBk(AxQY45miHC28oGEAOwF*f`K&tfQLYcKhXBiK=dyrfjHmm zrcOc-iTruzoqO)N_nvd^z3FW3{G_qVfi-|s)lB58F(F(O9v zGl~(tq26HABZl*`l4z~WvFBBGyYjv^&iqm8SVzAz^ee85)6PZqig?}m{H(I*vFg^n z8;TJ!!fWbcLpfl&t-uXtv@ALK%T~aMoKT2S$yU&6Xi><@2di>p{;bio?zfcnI?HV- zi_Q&ogO%&$yC&bopIm*y@9JLt8NL4ZV0YY@`_=x%e-sZyi<4pw&cEYqV6$Wwfn1%gxjtdYSR#^GZ2rF~hMM zRo25B%30-!SUF_0TcN88H4mD~Izw|*u|iSB(4qn&Ex24iIXbd$Mn zF=7P`&4{h3RDxt2}mZFkZzbhE_s zyc0UleA=WX6>`faf0z_v(AysFB7p~i1F&%znUu-9#Z%sv<{nZcqPC-$&d%|JFu6y3 z&KvK!07HM(kv?0>WQ{bp{c|N6DXni!Q@rgB57?npiSu;cyrb5OCeP&O?dGbLo?0m7 zm$sCmv7v+6Vz{AaRZfqe$B++UFfUJI89F9s2a?&WE`y*iXRf8QMVl?z^_kp!Rmm@+ z+a=&`F8DEcaqD^teRZs*m?ebOFbgs_^JoFhrMj4h`87ZD+^hOY*{-=6HLDHGCI8*3 zhXpl1Y27S9bo|?`Q9G<09@eC}#6!Lll(H9mk4#i!Ia?@Xc{gT)P*zjqB+1q&4%`79 z2Ws<_5+=(#2<;;7Z_`xXA-W>L%0+(zV||q;_r1wnH()X;VvX}`L_F8{YxazIwCNGo zr=eaGH=8O`Uw|lpF9KfzzD$_*B}?-;q@=(pZ<7f>x6oDwJ_WP^?LZUo1P}tQh&P%a z{BRQDt}d(4Jp;%YJWJAaLoRboRzz_*o#LMrhl7LB+CsMylF4UGT`!B3;CBz9HNiMe z!-UG^IA4d)oxS*1NcWbw5L!I_3Q2ZjvZ&`vx#bm|_8(801umDd^o$=Yi71r*J+F47|`|KiVH2h ztDhvRsqsXBn=mf{R{;%J0gz|zBG_6cZ}Mw+{RV(#a#TyTP`(N2+rVo;ZGuulIQew~ z5>q@(a4Tw^D*n+j$gWqqBK^$u5==(KrRW0}Ws+XR`x+skRkK!8sQd~VTdR$=M0^d} z+Q@QHO!E)o-ROO+Pjt8bDzUz0lVCQxwn@lTyTICZk7kp(StEJ!Jw>#(O&pR1w~tyV z{zvJtsa6jfIp$5cl7k#4>0PnbcIIq9bc~R2aHOAa5Nsz;=I7J4v$&94PT$GAtrbdU zb?EQGP&z^s|0;s*$7XgW#HI`zg_N#u3%g8@+@v!0wRP7bzDe$WN57(*;8q;Wt+smk zCbp}%)jku#QOrAmE?_^HSURSb2aX2Ns)mULi+D>_KbCikg&E)BvEy;V6r_mc{ zKFuPl>_n2EmyUuvG)rpwziX{;4|FaJ(Wb!vfyx!H#Dkz0%Z0|Aj zixCn5J9O#%*~_PsW@=%I!cm+n@SJ{SWUpjT$y<4R_a!0^wLWrIIp_>)Lp%ZJ_)L%s zm-~Ys!s`Gq2n+!-$0OoBckCR;A+fENKD6^ymez67jWaU)2np8@?>TF6t3I~+zw72S z-0m(O)F2-RCV+DU+W*V+7^kQFl6epflfXxTyLWmbwyngb(DbN6wNtGk+0zGQ9FP?v 
zYf6?=Cv-CJ5lBBF*mbFeLMD|iR?BP}`n_uH02<^)*bV7PKnI=zdLSm+t5qmdBU3Sp zwiI9h7XVp>bC9qH`2vsu7J((;ZjIWmt#37E!36>tUkP-|mK(ILzM=zwesxVnfTQ@y9Fu`=4f`^6J8+ s!*LFBls!j}y}Z)NpDP%pZ2BbClTsWO?;Pm$X#p0{12cgZ_1+-+4-Yu*{Qv*} diff --git a/prismatic/vla/datasets/rlds/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/__init__.cpython-310.pyc index 444a3ae766e193efbcec3f0c8b6bca01a1aeff04..a671810126924724fe7210caa9c535b2172844c2 100644 GIT binary patch delta 57 zcmey)_=k}@pO=@50SLU-U6{x{Ny$?`BR@A)zo;}bBQ-C*JTWg_zbq$FKg`Eb*D)or Lpd__u;x1bN54IDP delta 52 zcmeyv_??kEpO=@50SHVkpPR@%N!CTbC_leMKe4nVKP5-EB)336zaTZQEGJPnKdofq Gep>*!ZW2fU diff --git a/prismatic/vla/datasets/rlds/__pycache__/dataset.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/dataset.cpython-310.pyc index b27339b585cb5ac0968b8282ef8b08faf1bd8a8e..8e27b6f4e180bb1b309e5baef10cf4062a2377bf 100644 GIT binary patch delta 1186 zcmZ8fNlX+$6rCENL55MHf(u3w6>LC91?A$x2yVm$0W|@&(@fVav8TJO>0t!7s00&Z zqVV0KiE&571)LBy(Syc=9yDs=rO%$c851wYX#HvtLmyth|LRrMuUFMKNG=W%pXvAK zdidAy@WZh0YjMfOK-7%W087TA)QF^1BNFJ))xgfp!9_t$wQS1Lljr)pq%}Qzo=OT@ zvkjV3%=Uzn!HJV$>KdzubMt?ZP0+VsEm;qr7s%2Ez5&|^nWdk7El9T@G{BeAwm=(F z%E2Mw*1T|1*A-2*)daQK9#peOt0FYEf>G8@Qt-X(ldl7X0|=e4d3iBIh$^bCI~i(wf^~CMChQ3M#OlkC^}vd1ZTCr}e+@^+ z8F*PuJf4J?lVXqTWnE}E$bqkR2!DA_9@S}_8n%+KXoxj%(UHP2oyW&a_)*=MFS?g; z;=dzQ&ML(kTj7g}w#3%hmQw`1RYmY@c?&rM6IXoqpLShhnX?dFJq700RFPiTQ4{p_ z;WB`5E`7b`m^4R3QR&Wz^PiL15mzm~$$2zg;L{Ds^~1>OH#0AwdIf!wnv5u6W5CNU zq1Fc%*PN&tYyH2Ca9Ff|3z>-XCNj}{Axvs0givis&K<7FUZ()QZ7F~owX)QYrn~8n zwX3`x~s;0Nyno z%O63fNKXXt1omzpDt?RNVh-{uCtFc8Go&c&DT@0M_QB$&pkKV9EF&4`NhIJ38BU`?abD~?9Oy|)>N|GiYSVp zPNhW=1evClwM8UUFG7LPgHaFjAJkLOOAt}_n^A)1;hXP#=evh<&$$Dm?Aj;`n8DyY zpZLl?eS8-9RjGE8Me;(arzYsOWimlqu*s-zlci!Xhjt5 z2T#+jiXL{A^0)p7Uh#DI;5EvxY+yv~mDPdkFx~*%%nVhgrIqeQOFg$Ke$Je(xN3=# z127Fz!>)wn`DtR;hb4nh@4_dksiR6XG2-X9pe{U(p_bsCDEpJY@iO}1~ zoQP@lICC{!x4`%TWYDjegeBg(##2zb0IiU-lBqafHLIhM~-ps 
zrh8+Scj+*~oT#DNww~bR^Ud{fEQMPX3YRtvBb|N3Jvjy$myj3p=d;`!)eD6G=Jo$v zXehjajnZ^Dz2Fs`?#|q;yr#&$@uDe+8wJQaog78c%!s1!35ZERoVK)92i>QVok+#S zBvO=aUC*L4*7}FNqp|&)w#Q&R2zUs13$Ot0yt1Q)soLI+#T~`20az%X$`l^xXMX_^ Cm@W+f diff --git a/prismatic/vla/datasets/rlds/__pycache__/obs_transforms.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/obs_transforms.cpython-310.pyc index 01b6a495ec763ac8db30b032ef9f42da3ac7988c..5bab2b0b6a0414a2950a26a19503c692baf30ac0 100644 GIT binary patch delta 60 zcmeB_?UUus=jG*M00OUd7dCSHFe~}%XXNLm>KBz}W~An&mnY_>>zCyu>WBF_>N=(* O7L=qGZ7yLJtbwP?dpO=@50SLU-UD(K7%BB>cpOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L OSWuE$w0SPuIyL||E))R( delta 55 zcmca0by|u$pO=@50SF8)pWDb?$|mcrUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) Jc@f(>HUQp960raP diff --git a/prismatic/vla/datasets/rlds/oxe/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/rlds/oxe/__pycache__/__init__.cpython-310.pyc index a2a9824f64fdaff18d44863b3a9d1969233384cd..647fe66497dd16fc8d3876708e0a403249a1074f 100644 GIT binary patch delta 57 zcmbQow3vxIpO=@50SLU-U6{zdS;<#FBR@A)zo;}bBQ-C*JTWg_zbq$FKg`Eb*D)or Lpd__u;%#RD?+6ny delta 52 zcmZ3?G>?fppO=@50SF8(pPR_NS=K|pC_leMKe4nVKP5-EB)336zaTZQEGJPnKdofq G17`r5>Ji`o diff --git a/prismatic/vla/datasets/rlds/oxe/__pycache__/configs.cpython-310.pyc b/prismatic/vla/datasets/rlds/oxe/__pycache__/configs.cpython-310.pyc index 47702563158f5075e53b9b58714965523b2e6924..20e3448b57c67cb1bb02b0aa43dd40acdc6122d4 100644 GIT binary patch delta 2242 zcmZ`(Z){Ul6z_d~ef_sTfGu?EW-W}iZW&#&=s*OSBCuK^YzkV``r1?5SGsm~8v{YQ zfjXHuH?GPDm&GjcgC?3_4WCH#izY@t>=Q9DQPik>&;@1tb2dZ(Jn}uz86jOlfOEN?V|5!9)pcVNB+S`!vk0jA^bg6$9K)dqgMAi4OC& zQhv3Tc{^Xi* zp`Q^y8BmotLttZUTSX;2Qq~x*#F-zhtjS)wEUh!{TO!nD+D3cDHEKs)gUA?yO}dFa zrc?g78o%y<4nK$)o|wPZA-zX%R!`DplhYZ2L*;%N_{5)5J&jLukE*?ahNO1dY_e2+q zZud$5u()s(HtXF5PSmt9f&JSsZS_glGmpYCcma;DWWtH0TDo-JUWAhgQ6UO}!h(c% zV-lv|6r5%eFTu<3N`Wm)A=*ncjaQ9dGWJ`#PI}&}a0X_;Q&R#v2>gh=24~^*8oc)E z+dB(ylnc+%9&>x+ybd|1L#PgUQ-{1&jRfDu>DlVcPdkkB{Aj#u>7a9V6Y0Z$iMu2Y 
z)?L!E20z``5XQ%6d}qN@Tx>Wkt`7%Ox#2`GKdz(_+2n+rO$Lu-4Z(TF>q%Zv}l6ge7~m3&5yMMuU*Q)AJ5Le4~r zap#vJMs~OM;&BfA*_*gWX z!%uExw0P5Vl8_U{o9i!j*JYCZz3p=;69Q;VCJHmG5M%lj# px6f_AR<_-$O@5{`{`dPr`l>jwi)HR$&D0mRRR?dP($%-%a+2{x@fh^ zwuOps4AwFV4Vl?nWG`^w+qQ=L$cc#N^XPL?Fp6|Qg zdE9gE-M82Nwx%f+3KOc&9wc**Kh z;FhB-QQYrOX#GBzGhK`f+CV7fI@~&Ak@7aF)LqIYcuor+Vqqhx!0pnj72As%Su582 zVw>BE>9FI(q0_P`&Yhl&cUiu&wm9kTmexHG?&DOOJLA1}mN{yyXelhicbUnSiz>=s z1sGCpxC&fp6&TT1UG>wuuG;KA5|o$T@2P0D?TBV^i@6qBJZnYua zsb3dwd{FjsRw&pAo19$+tqHuvsGpk)svok|Vw1KS9u~D?uNJ{IS{Z)A!#JXKlu@I5 zJhf~UHL@POq}5}a9>H2&I$W24dC+SO2>M{#)VZohlUA2J8}FAv1Fj}-FQlY#SkrTl zV<0~bwyRb84tpAOV%F^}6p1#Hk~!Ux2Bq>bPj%ic56SMF@Xf>5eC5=g(x_~2TDI3; zC;(pYnbaf0-DR?^H4@UaLC9)3cGw^IeO|m3|-P0Ofrwok37>rAgJ+K!h zq-A-yuibNWIcKh&eL*vHSG~f0@I1T#H3ds>pNsu)0A9R?&Yk!r$U2D+I#wyo509uN zp-J~LPO1w+Qy%7!hd~eX@@yvl3U$)BMlan3zG@$lGrU$m?AVJi8_?;m<_wQ&4(T?r zA^1Aea~}lvs0)_Ghf=9@yk}QB1q03L9fR@IU}EPJ13k^D-gGWGceIF|#~;G`a6D4Y z&gb5Ye54YQth|Z7k5qohnsQK8!dMgTsvc(X+|SkDag0SP@XMN&l%g%Vo3V86o!ES~ ziF3|4Q~xoFHKvO93LjT8+%TBIZ02uej9(^hZa$~}LR^K{pVXIVr4~!-!W#dY%oeO( zRN*83GXi(afcFV>3DI(Xip&iKBORlat@(NR8^k4NafgW;v@%YTIU>v(`Gk@opxauF zc6V7BqdNYSd})3R;(x-{MoMn*NWREr>q;1D=KKE>5{e%bXz?F5Dxd8_ zalg0M7UHN`Vekt!Y?NWXveC@~UH`F-Iz$*b5T?=qWZ7r!wWTrzPSkJ5pc(U1Kg7$2 zu+^*yWGzIP;}$a#_}9UH%3c+?;^45ru^$~A^;S-`e~a&#$Ji*(AlC&&$ij00ds^E^Oq!#I5A3pOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L OSWuE$wD}h|Ga~>%!4vfW delta 55 zcmZpY`YXwu&&$ij00jD%&u!$s#4YQgUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) J`9C)^BLL+<5~BbB diff --git a/prismatic/vla/datasets/rlds/oxe/__pycache__/transforms.cpython-310.pyc b/prismatic/vla/datasets/rlds/oxe/__pycache__/transforms.cpython-310.pyc index 742579a28a48b1e26a2fe12d4dcaa1b5c1ab17cf..248af71a25ac80a7fb5f659e0e2b5796ed503b17 100644 GIT binary patch delta 1906 zcmZXUOHh+n6o5T5HYBEqBT$}sK#fQ!1j-{MhLVr~2?A}$QihgVc|;z{>lefqq7_SR zsR*Y?DHIq$0S6jmR9dkEbkT(?7Hy|xVOi7#tFHQT?9hAeAA;0>@y&nkdERsHxmV@@ z=Rn@z@4sUk`|`e8cFTVVh<;sH-B9P$2`x3%&ibm>vid6BiDPBDxAQIWmddimW~bnJ 
z8t_C4@8DmdKY|S#gCcP)?6KU;VhDkREc_+>bI8G^Oa*pF3_>=#6GG7%nGJSseLZpv zthl69;A+|nSgS~Ze7-)fsDc9a7leA{N&h?+Dt5b|poms{pe)5`WiXD|Wu7!u3&24Z zJ_r}pk0cJVDDq51MFLB;6g>}zxK14?#etVs;IP+HhCO>`AQzXC!?0MB1C^pxsto5f zKf@6&xg0Wd?_JqdK`{*sYt$RTiGwlia1?vYQE1vX8e(CK)85*YWZ*}z@_ou<)1l6n zvMhE<@+K+Eai3PR%SpdG8EXDsUtbq$Fw+u(&Ds#w59-l`)&>o}vLtaP31t!@(5+XX zGfoSQo~v=Y0V;6(l~c?|5nltXUVABy$DfDOK5t9H9JKlBcuDuIiYMnM6tPi;z&9F1 zVlJ;>N1_?dF|SChN^FH=e62A_)|;e*3$*;A&`!qZ2^(6CW(K;SKWre_b+Ezr-s)dm?{MXf=#r-uxCw_5Z5|G(r*hFHmGWH%s2VC`Bq zRI+xhn_{rk7!O0wSFFH?MvYkYu-8k5yZ3j&2$$#*KiFRmwfyw-rYsosS@oOBS?njK zQW*2DN&SQ`DE}Zo3Yy5Fj0nk>v^2hHa+2gz+~nVkSn)t6_^g^=`_*xlwBl8zKi)5X zvwW?Z_7j>&B@m7i_+azlX(q9SZ~}X@fp{h}8*Z@FZERTG3E@~`*25&5z47go0v8fh z>|!=$g<+G$2vfYqMN6-W7eO;CZ1&ifU#MjsMKJ9xS%$x5b%gt-{Fx)YzF9|~)dn}k z7O>Zll>onB{K2XD_h1 zapfrZrbF>@ju~8Ha`r-5bDKd-w49p&xA}UYJsI2>Rj5R#T>~EegihMq*;2QZzyBv4rw`gx}!dtxP&O8&`6LYW)uIA;!9FJl<@RPiT%y=WkNE!JC_?omH z%$I@p{k~sdg>QI(VWV%}q6#f=-&0ZeofK9)dyC7U-A1u;2zEj~p@2|GI7lcW93mVh zR1%I5juPq!4TMHQE8#Sujc|@|fpC$~LFgo0CUk9MFP6|vzewmKTqE=o1_(ohVZsPu rlrTmZCrl7-5GDyzglWP}!VF=S;3C{6cnEh0_ZaYeNswotZUTFOUF2qClt!Z9MyZ_Gb9G;@S zDbh81dA+bsd`}L~Id%6pDEV<}V`Fn%c}sI+WqnNZzNWawrmBVm_2n^*)y*#Q{?n5l z@It^mSvg^=ANLn%`Io?-bUDI>>I<0p$Dl#VV;^5X?g<{DTpoxDV4L1b_7#h9{bjOn zp;F3Q4F1B0C*MQg=Y_ra!XXOsC*!*jGAOBy66ltaaLL=*F{Isep%;O^xss;S6^)tnxQeZ z)G9Rfd^M(%4oREb*ePn45rgAqLSz`(`QYV%sw~M5Ta+Hi-tk$~AuFnk@1T7$?}d0H z9mPLCtxz;WxfA#ZSXR+;dxBo`F@AMP`-udH$gSeFNjfI8N=iIT)$B~HqZ8PBsK=rE z2|HsIE=x+$e2Q(iG<78HrCy;lpgD}6{h|Urf1CV=yHTsjyHD%U>t;G7?J`renjTo3 z6kZKnm$F&YhwYjFlipvJo|O#>GPcua{Do0jnt?fE4h^zTszL)Cml_iGIUF${J1AQw z8~X2bx>F-Hr=d3_%briI6lJHS?V$7j%|hY1v~YKh7lpRM8`DqHC3g=Zv^<+$LGQ~W z$jC6!WvMZoQ6c;-W|Z-4ra^NKfre!dhch>6E?_%?xE^X7)*njIb6X?bog5WY^Td7x z5)7)kn!B8XmCD9ER2A|MfM!5mF*(pJ*d725vhyWh?##0C>1+c(%-SPbw>>40-_17C zRarz&u$Eof$Hm;H<%BLxQ%g=djj=Q53LW9MP0CV%eWpzMT4bm2V^deq%0+vo*;iIW z4w)@9E&}NJFLQ}MlpinP;;iJa?vY(USFE_0%QC)j@v2y+cc+{Fk*JbX% zTknRzWY_a;kVj+7YbJN#`Mg+iNiPAmM7k*tq1F~b6U(m? 
zJ8bRiZwe>;>o$KKo)lG1Q~?*;OK4J>I>ZFb*stlFkV02|legOfxi>$ZreyVx^7H7{ zip%_M^SU~y$y0@=05%n*(rswC`U`wWzN8A@Ei}^|8Dh%a}b1xx^wz!Y!`xDDI^rd{WD HKBa#EjwS3Q diff --git a/prismatic/vla/datasets/rlds/oxe/configs.py b/prismatic/vla/datasets/rlds/oxe/configs.py index 3222e02..1e550a2 100644 --- a/prismatic/vla/datasets/rlds/oxe/configs.py +++ b/prismatic/vla/datasets/rlds/oxe/configs.py @@ -177,6 +177,13 @@ class ActionEncoding(IntEnum): "state_encoding": StateEncoding.POS_EULER, "action_encoding": ActionEncoding.EEF_POS, }, + "calvin_abc_rlds": { + "image_obs_keys": {"primary": "rgb_static", "secondary": None, "wrist": "rgb_gripper"}, + "depth_obs_keys": {"primary": None, "secondary": None, "wrist": None}, + "state_obs_keys": ["EEF_state", "gripper_state"], + "state_encoding": StateEncoding.POS_EULER, + "action_encoding": ActionEncoding.EEF_POS, + }, "columbia_cairlab_pusht_real": { "image_obs_keys": { "primary": "image", diff --git a/prismatic/vla/datasets/rlds/oxe/utils/__pycache__/droid_utils.cpython-310.pyc b/prismatic/vla/datasets/rlds/oxe/utils/__pycache__/droid_utils.cpython-310.pyc index 9b77c3045126e236cda91f09e5610545364b52a3..71a9e6398886bda247032efe04e49ae149c26c06 100644 GIT binary patch delta 60 zcmcbpdQFu(pO=@50SLU-UD(Jymr*H9KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Ov7jWiX!B7<9zg&^^Ap$r delta 55 zcmcbndQp`-pO=@50SNRjpWDbimr*uQzbHSyL_e{#BtIobw+GQ5{3W( diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/__init__.cpython-310.pyc index b0c94dfd4f5b24f4c664cc0ac4e1278af010e4e9..a2d7fdc958d8290f4f4418ba55ae44782c3ea1a5 100644 GIT binary patch delta 57 zcmZ3^xQ3BCpO=@50SLU-U6{yiqvWrjk)NBYUsRfzk(!rYo|u=eUzU@oALiqz>zI;Q LP?B0SF~u1G-^CJD delta 52 zcmZ3(xSWwYpO=@50SJsPpPR^SBkQGKl%HRspIBOwpOT|nl3SpkUyzztmXoNPpH?z4 G(-{Db?hyt6 diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/data_utils.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/data_utils.cpython-310.pyc index 
a07036cd3a6d61b1ef0bf40fb571b413bda25fb3..1d39a03bfde9ed7f3f8c2f5052bb081cab411a00 100644 GIT binary patch delta 59 zcmaDG_bHA$pO=@50SLU-UD(L&%B&QqpOK%Ns$W!^nUR{8UY?kju3wgus2}FzsOy-L NSWuE$v^bYp2LMNd6VLzv delta 54 zcmewq_co3@pO=@50SJsPpWDdo$}H=nUzDF;qMuk=lAn^JTasI#pI?xgSC*5go1a#) IxQJN?0PPhLfdBvi diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/goal_relabeling.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/goal_relabeling.cpython-310.pyc index d8e365f5277f45cccd629bf8129fcbba9865f506..1a7a1c6ae62d2cbc72bda9a82806c8f4deec5107 100644 GIT binary patch delta 60 zcmbQsxrmcHpO=@50SLU-UD(LIg-Iz)KO;XkRlle-Gb1%Gy*x25UB4_RQ9sPbQP(jg Ov7jWiX!9MWgNy(r^ArsL delta 55 zcmZ3)IhT_=pO=@50SJsPpWDd2g-JG0zbHSyL_e{#BtIobw`%^5pe(j diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..ee70ede --- /dev/null +++ b/run.sh @@ -0,0 +1,35 @@ +# data_name=calvin_abc_rlds +data_name=libero_10_no_noops +export HF_HUB_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \ + --vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ + --config_file_path pretrained_models/configs \ + --data_root_dir data/libero \ + --dataset_name $data_name \ + --run_root_dir outputs \ + --use_film False \ + --num_images_in_input 2 \ + --use_proprio True \ + --use_lora True \ + --use_fz False \ + --use_minivlm True \ + --image_aug True \ + --num_steps_before_decay 200000 \ + --max_steps 200005 \ + --save_freq 20000 \ + --save_latest_checkpoint_only False \ + --merge_lora_during_training True \ + --batch_size 8 \ + --grad_accumulation_steps 2 \ + --learning_rate 2e-4 \ + --lora_rank 64 \ + --use_pro_version True \ + --wandb_entity "my-wandb-org" \ + --wandb_project "$data_name" \ + --run_id_note VLA-Adapter--$data_name--$(date +%s) \ + # --resume True \ + # --resum_vla_path 
outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops----100000_chkpt \ + # --resume_step 100000 \ +# > logs/VLA-Adapter--$data_name--$current_time.log 2>&1 & \ No newline at end of file diff --git a/vla-scripts/finetune.py b/vla-scripts/finetune.py index 03263c1..bbb85e4 100644 --- a/vla-scripts/finetune.py +++ b/vla-scripts/finetune.py @@ -348,6 +348,9 @@ def run_forward_pass( ) # Get action masks needed for logging + #* batch["labels"] has L entries (L_a + L_lang); the first one is the BOS token, so [:, 1:] indexes entries 2 through L. + #* current_action_mask selects the first 6 of the L_a action tokens within L. + #* next_action_mask selects the remaining 58 of the L_a action tokens within L (64 action tokens are configured). ground_truth_token_ids = batch["labels"][:,1:].to(device_id) current_action_mask = get_current_action_mask(ground_truth_token_ids) next_actions_mask = get_next_actions_mask(ground_truth_token_ids) @@ -394,7 +397,12 @@ def run_forward_pass( else: # Get last layer hidden states multi_layer_hidden_states = [] - + #* The hidden_states of each layer have shape [B, 1 + L_v + (L_a + L_lang -1), Dim]. + #* text_hidden_states is [B, L_a + L_lang -1, Dim]. + #* actions_hidden_states picks out the positions where the action mask is True: [B, 1, L_a, Dim]. + #* task_latten_states is the vision part, i.e. [B, 1, L_v, Dim]. + #* Concatenating these two gives [B, 1, L_v + L_a, Dim]. + #* With H layers in total, concatenating along dim 1 yields [B, H, L_v + L_a, Dim], which is the input to the action head. H is the number of intermediate layers. for item in output.hidden_states[0:]: # last_hidden_states = output.hidden_states[-1] # (B, seq_len, D) # Get hidden states for text portion of prompt+response (after the vision patches) @@ -719,6 +727,31 @@ def finetune(cfg: FinetuneConfig) -> None: # Create experiment run directory run_dir = cfg.run_root_dir / run_id os.makedirs(run_dir, exist_ok=True) + from omegaconf import OmegaConf + from dataclasses import asdict + import json + cfg_dict = cfg if isinstance(cfg, dict) else \ + OmegaConf.to_container(cfg) if OmegaConf.is_config(cfg) else \ + asdict(cfg) # dataclass + + # 2.
Path 对象转字符串,保证可 JSON 序列化 + def _convert_path(obj): + if isinstance(obj, Path): + return str(obj) + if isinstance(obj, dict): + return {k: _convert_path(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_convert_path(i) for i in obj] + return obj + + cfg_dict = _convert_path(cfg_dict) + + # 3. 写入 run_dir + config_json = run_dir / "config.json" + with config_json.open("w", encoding="utf-8") as f: + json.dump(cfg_dict, f, indent=2, ensure_ascii=False) + + print(f"Config saved to {config_json}") # GPU setup distributed_state = PartialState() @@ -775,7 +808,7 @@ def finetune(cfg: FinetuneConfig) -> None: processor = AutoProcessor.from_pretrained(cfg.config_file_path, trust_remote_code=True) if cfg.use_minivlm: - hf_token = '' + hf_token = '' if 'prism-qwen25-extra-dinosiglip-224px-0_5b' in cfg.vlm_path: vlm = load(cfg.vlm_path, hf_token=hf_token, load_for_training=True) diff --git a/vla_adapter.egg-info/PKG-INFO b/vla_adapter.egg-info/PKG-INFO index 7ad4e9b..5738a98 100644 --- a/vla_adapter.egg-info/PKG-INFO +++ b/vla_adapter.egg-info/PKG-INFO @@ -1,11 +1,11 @@ Metadata-Version: 2.4 -Name: openvla-oft +Name: vla-adapter Version: 0.0.1 -Summary: Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success -Author-email: Moo Jin Kim , Chelsea Finn , Percy Liang +Summary: VLA-Adapter: An Effective Paradigm for Tiny-Scale Vision-Language-Action Model +Author-email: Yihao Wang , Pengxiang Ding , Lingxiao Li License: MIT License - Copyright (c) 2025 Moo Jin Kim, Chelsea Finn, Percy Liang. + Copyright (c) 2025 Yihao Wang, Pengxiang Ding, Lingxiao Li, Can Cui, Zirui Ge, Xinyang Tong, Wenxuan Song, Han Zhao, Wei Zhao, Pengxu Hou, Siteng Huang, Yifan Tang, Wenhui Wang, Ru Zhang, Jianyi Liu, and Donglin Wang. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,10 +25,10 @@ License: MIT License OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -Project-URL: homepage, https://github.com/moojink/openvla-oft -Project-URL: repository, https://github.com/moojink/openvla-oft -Project-URL: documentation, https://github.com/moojink/openvla-oft -Keywords: vision-language-actions models,fine-tuning,robot learning +Project-URL: homepage, https://github.com/OpenHelix-Team/VLA-Adapter +Project-URL: repository, https://github.com/OpenHelix-Team/VLA-Adapter +Project-URL: documentation, https://github.com/OpenHelix-Team/VLA-Adapter +Keywords: vision-language-action models,tiny-scale backbone,fine-tuning,robotic learning Classifier: Development Status :: 3 - Alpha Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education @@ -66,7 +66,7 @@ Requires-Dist: tensorflow==2.15.0 Requires-Dist: tensorflow_datasets==4.9.3 Requires-Dist: tensorflow_graphics==2021.12.3 Requires-Dist: dlimp@ git+https://github.com/moojink/dlimp_openvla -Requires-Dist: diffusers +Requires-Dist: diffusers==0.30.3 Requires-Dist: imageio Requires-Dist: uvicorn Requires-Dist: fastapi @@ -82,100 +82,704 @@ Requires-Dist: boto3; extra == "sagemaker" Requires-Dist: sagemaker; extra == "sagemaker" Dynamic: license-file -# Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success +
+ +
-**Project website: https://openvla-oft.github.io/** -**Paper: https://arxiv.org/abs/2502.19645** +>### The official implementation of **VLA-Adapter**. If you want to discuss the VLA-Adapter, please feel free to join our **WeChat group**. The QR code is [here](https://github.com/OpenHelix-Team/VLA-Adapter/issues/1)! +
-**Summary video: https://youtu.be/T3Zkkr_NTSA** +
+

+ +

+
+ -## System Requirements +> **📝 Paper: https://arxiv.org/abs/2509.09372**
+> **🌍 Project page: https://vla-adapter.github.io/**
+> **🤗 HuggingFace: https://huggingface.co/VLA-Adapter**
+> **Github: https://github.com/OpenHelix-Team/VLA-Adapter** -Inference: -* 1 GPU with ~16 GB VRAM for LIBERO sim benchmark tasks -* 1 GPU with ~18 GB VRAM for ALOHA robot tasks +
-Training: -* Between 1-8 GPUs with 27-80 GB, depending on the desired training setup (with default bfloat16 data type). See [this FAQ on our project website](https://openvla-oft.github.io/#train-compute) for details. +## :loudspeaker: News! +- **[2025/09/22]** We released our codes! An enhanced **Pro** version is also released (this version conforms to the pipeline in the original paper, but is optimized in implementation). Everyone is welcome to use it!🎉 +- **[2025/09/13]** Our paper won the 🥇**first place** in the [daily list](https://huggingface.co/papers/date/2025-09-12) and the 🥈**second place** in the [weekly list](https://huggingface.co/papers/week/2025-W37) in HF! ⭐ +- **[2025/09/12]** We released the original version of the VLA-Adapter for four LIBERO models on [HuggingFace](https://huggingface.co/VLA-Adapter). +- **[2025/09/11]** We released our paper on [ArXiv](https://arxiv.org/abs/2509.09372). -## Quick Start +
-First, set up a conda environment (see instructions in [SETUP.md](SETUP.md)). +## :black_nib: TODO List -Then, run the Python script below to download a pretrained OpenVLA-OFT checkpoint and run inference to generate an action chunk: +- [x] Release **checkpoints** for reproduction. +- [ ] A more **powerful version**, **VLA-Adapter++**, and a detailed **technical report** 📝 will be released soon.
+- [ ] Continue to update the code to adapt to various **real-world systems** deployments, including the configuration of our paper, Franka, UR-5, and AGILE Piper.
+- [ ] It will soon be compatible with **various foundation models**, including but not limited to [VPP](https://arxiv.org/abs/2412.14803), [π0.5](https://arxiv.org/abs/2504.16054).
+- [ ] We will update the **diffusion transformers** and **flow matching** policy networks in the future, and the results will be updated in the subsequent VLA-Adapter++ technical report. +- [ ] We will also update and give more experiments on **Frozen backbone**. +- [ ] We will expand its **generalization** further in the future. Work is in progress! So please stay tuned! +- [ ] **RL post-training** is also in progress. Interested researchers are welcome to join us in building this foundation! +- [ ] **The dual-system compatibility** of VLA-Adapter is under exploration! -```python -import pickle -from experiments.robot.libero.run_libero_eval import GenerateConfig -from experiments.robot.openvla_utils import get_action_head, get_processor, get_proprio_projector, get_vla, get_vla_action -from prismatic.vla.constants import NUM_ACTIONS_CHUNK, PROPRIO_DIM -# Instantiate config (see class GenerateConfig in experiments/robot/libero/run_libero_eval.py for definitions) -cfg = GenerateConfig( - pretrained_checkpoint = "moojink/openvla-7b-oft-finetuned-libero-spatial", - use_l1_regression = True, - use_diffusion = False, - use_film = False, - num_images_in_input = 2, - use_proprio = True, - load_in_8bit = False, - load_in_4bit = False, - center_crop = True, - num_open_loop_steps = NUM_ACTIONS_CHUNK, - unnorm_key = "libero_spatial_no_noops", -) +
-# Load OpenVLA-OFT policy and inputs processor -vla = get_vla(cfg) -processor = get_processor(cfg) +## 🌟 Table of Contents -# Load MLP action head to generate continuous actions (via L1 regression) -action_head = get_action_head(cfg, llm_dim=vla.llm_dim) +- [:rocket: Quick Start](#rocket-quick-start) + - [Conda Environment of VLA-Adapter](#conda-environment-of-vla-adapter) + - [Install Dependencies](#install-dependencies) +- [:pencil: Data Preparation](#pencil-data-preparation) + - [LIBERO Benchmark](#libero-benchmark) + - [CALVIN Benchmark](#calvin-benchmark) + - [:video_game: Our Dependencies](#video_game-our-dependencies) + - [:pushpin: Benchmark Location](#pushpin-benchmark-location) +- [⚓ VLM backbone](#vlm) +- [:fire: Training for Different Configurations](#fire-training-for-different-configurations)   => Provides **training configurations** for GPUs ranging from **10GB** to **80GB** of VRAM. + - [:books: Related File for Training](#books-related-file-for-training) + - [:ledger: How to Train on Extremely Limited VRAM GPUs](#ledger-how-to-train-on-extremely-limited-vram-gpus)   => A card with 10GB-12GB *(e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070)* + - [:ledger: How to Train on Low VRAM GPUs](#ledger-how-to-train-on-low-vram-gpus)   => A card with 24GB *(e.g. NVIDIA GeForce RTX 3090 and 4090)* + - [:ledger: How to Train on Larger VRAM GPUs](#ledger-how-to-train-on-larger-vram-gpus)   => A Consumer GPU with 32GB *(e.g. NVIDIA GeForce RTX 5090)*   A Professional-Grade GPU with 40GB-48GB *(e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).* + - [:ledger: How to Train on Sufficient VRAM GPUs](#ledger-how-to-train-on-sufficient-vram-gpus)   => Professional-Grade GPUs with ≥80GB *(e.g. 
NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).* +- [:mechanical_arm: Inference](#mechanical_arm-inference) + - [:books: Related File for Inference](#books-related-file-for-inference) + - [🤗 Checkpoint of VLA-Adapter](#ckpts) + - [:notebook: How to Eval](#evals) +- [🌈 Success Rate Comparison](#results) +- [📝 Citation](#cite) +- [:heart: Acknowledgment](#heart-acknowledgment) -# Load proprio projector to map proprio to language embedding space -proprio_projector = get_proprio_projector(cfg, llm_dim=vla.llm_dim, proprio_dim=PROPRIO_DIM) +
+ -# Load sample observation: -# observation (dict): { -# "full_image": primary third-person image, -# "wrist_image": wrist-mounted camera image, -# "state": robot proprioceptive state, -# "task_description": task description, -# } -with open("experiments/robot/libero/sample_libero_spatial_observation.pkl", "rb") as file: - observation = pickle.load(file) +## :rocket: Quick Start -# Generate robot action chunk (sequence of future actions) -actions = get_vla_action(cfg, vla, processor, observation, observation["task_description"], action_head, proprio_projector) -print("Generated action chunk:") -for act in actions: - print(act) + +### Conda Environment of VLA-Adapter + +```bash +# Create and activate conda environment +conda create -n vla-adapter python=3.10.16 -y +conda activate vla-adapter +``` + +### Install Dependencies + +```bash +# Install PyTorch +# Use a command specific to your machine: https://pytorch.org/get-started/locally/ +pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 + +# Clone vla-adapter repo and pip install to download dependencies +git clone https://github.com/OpenHelix-Team/VLA-Adapter.git +cd VLA-Adapter +pip install -e . + +pip install packaging ninja +ninja --version; echo $? # Verify Ninja --> should return exit code "0" + +# Install Flash Attention 2 for training (https://github.com/Dao-AILab/flash-attention) +pip install "flash-attn==2.5.5" --no-build-isolation +# If you run into difficulty, try `pip cache remove flash_attn` first, or visit the website to download it. (https://github.com/Dao-AILab/flash-attention/releases/tag/v2.5.5) +# You can download the corresponding `.whl` file according to the cuda version of `nvidia-smi`, and then run `pip install flash_attn-2.5.5+cuXX...whl` to install it. +# We use the `flash_attn-2.5.5+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl` file. +``` + +
+
+ + +## :pencil: Data Preparation + +### LIBERO Benchmark + +- **(Optional)** + +Clone and install the [LIBERO repo](https://github.com/Lifelong-Robot-Learning/LIBERO) and required packages: + +```bash +git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git +pip install -e LIBERO +pip install -r experiments/robot/libero/libero_requirements.txt # From vla-adapter base dir +``` + +To download the [LIBERO datasets](https://huggingface.co/datasets/openvla/modified_libero_rlds) that we used in our fine-tuning experiments, run the command below. This will download the `Spatial`, `Object`, `Goal`, and `Long` datasets in `RLDS` format, i.e., `libero_spatial_no_noops`, `libero_object_no_noops`, `libero_goal_no_noops`, `libero_10_no_noops`. (`"_no_noops"` stands for no no-op actions, i.e., training samples with near-zero actions are filtered out). These datasets require `~10GB` of memory in total. If needed, see details on how to download the original non-RLDS datasets [here](https://github.com/openvla/openvla?tab=readme-ov-file#libero-setup). You can use these to fine-tune Prismatic-VLMs (built on Qwen2.5-0.5B) or other VLMs. + +```bash +git clone git@hf.co:datasets/openvla/modified_libero_rlds +``` + +When using LIBERO, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: + +```bash +sudo apt-get update +sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev +``` + +### CALVIN Benchmark + +- **(Optional)** + +```bash +git clone --recurse-submodules https://github.com/mees/calvin.git +export CALVIN_ROOT=$(pwd)/calvin +cd $CALVIN_ROOT + +# Installation of `pyhash` may fail on some machines. If it fails, you can solve it by lowering the `setuptools` version: `pip install setuptools==57.5.0` +sh install.sh +``` + +To download the [CALVIN ABC→D datasets](https://github.com/mees/calvin/tree/main/dataset) that we used in our fine-tuning experiments, run the command below. 
+ +```bash +cd $CALVIN_ROOT/dataset +sh download_data.sh ABC +``` + +If you want to download the RLDS format, you can visit [here](https://huggingface.co/datasets/zhouhongyi/calvin_abc_rlds) to download it. This dataset requires `~50GB` of disk space. + +When using CALVIN, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: + +```bash +sudo apt-get update +sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev +``` + + +### :video_game: Our Dependencies + +- **(including LIBERO and CALVIN)** + +At this point, the environment is fully installed. If you want to confirm whether the environment is correct, you can see the `our_envs.txt` file we released. + + +### :pushpin: Benchmark Location + +The downloaded dataset can be placed in the `/data` folder. The overall directory structure is as follows: + +``` +· +├── data +· ├── libero + │ ├── libero_10_no_noops + │ │ └── 1.0.0 (It contains some json files and 32 tfrecord files) + │ ├── libero_goal_no_noops + │ │ └── 1.0.0 (It contains some json files and 16 tfrecord files) + │ ├── libero_object_no_noops + │ │ └── 1.0.0 (It contains some json files and 32 tfrecord files) + │ ├── libero_spatial_no_noops + │ │ └── 1.0.0 (It contains some json files and 16 tfrecord files) + │ + ├── calvin_abc + │ └── 1.0.0 (It contains some json files, 512 train tfrecord files, and 32 valid tfrecord files) + │ + └── other benchmarks ... +``` + +
+
+ +## ⚓ VLM backbone +We use the `Prismatic-VLMs` architecture. Since the file is large, please download it from [here](https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b). Then put it in the `/pretrained_models` folder. The file structure is: + +``` +· +├── pretrained_models +· ├── configs + └── prism-qwen25-extra-dinosiglip-224px-0_5b ``` -## Installation -See [SETUP.md](SETUP.md) for instructions on setting up the conda environment. +
+
-## Training and Evaluation +## :fire: Training for Different Configurations -See [LIBERO.md](LIBERO.md) for fine-tuning/evaluating on LIBERO simulation benchmark task suites. +**We provide different training configurations for different users. You can choose the configuration suitable for training based on your GPU card type.** -See [ALOHA.md](ALOHA.md) for fine-tuning/evaluating on real-world ALOHA robot tasks. +### :books: Related File for Training +* `vla-scripts/finetune.py`: VLA fine-tuning script -## Support -If you run into any issues, please open a new GitHub issue. If you do not receive a response within 2 business days, please email Moo Jin Kim (moojink@cs.stanford.edu) to bring the issue to his attention. +### :ledger: How to Train on Extremely Limited VRAM GPUs + +***=> Extremely Limited VRAM (A card with 10GB-12GB) (e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070).*** + +>***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** + +If your resources are extremely limited, you can set `--batch_size 1` and `--lora_rank 64`, it only requires `9.6GB` of VRAM. Certainly, `batch size = 1` will cause gradient updates to be greatly affected by extreme values, and loss convergence will be unstable. In this case, you can modify the `grad_accumulation_steps` parameter to simulate a similar effect. For example, `--batch_size 1` with `--grad_accumulation_steps 8` has a similar effect to `--batch_size 8`, but the training speed will be slower. This means that you can't use the [OpenVLA-OFT](https://github.com/moojink/openvla-oft) model on a card with `10GB` because even with `batch size = 1`, it requires `25GB` of VRAM. Fortunately, you can use VLA-Adapter. However, the `batch size` is still small, you can increase `--max_steps` to achieve the performance reported in the paper. + +>***About `vlm_path`.*** + +The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. 
You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. + +>***About `data_name`.*** + +Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `/libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. + +>***About `use_pro_version`.*** + +In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version`, requiring only `8.6GB` of VRAM. You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`.
+ + ```bash +data_name=libero_spatial_no_noops + +CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ +--vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ +--config_file_path pretrained_models/configs \ +--data_root_dir data/libero \ +--dataset_name $data_name \ +--run_root_dir outputs \ +--use_film False \ +--num_images_in_input 2 \ +--use_proprio True \ +--use_lora True \ +--use_fz False \ +--use_minivlm True \ +--image_aug True \ +--num_steps_before_decay 400000 \ +--max_steps 400005 \ +--save_freq 5000 \ +--save_latest_checkpoint_only False \ +--merge_lora_during_training True \ +--batch_size 1 \ +--grad_accumulation_steps 8 \ +--learning_rate 2e-4 \ +--lora_rank 64 \ +--use_pro_version True \ +--wandb_entity "YOUR_WANDB_ENTITY" \ +--wandb_project "$data_name" \ +--run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ +> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & +``` + +Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. + +
+ +### :ledger: How to Train on Low VRAM GPUs + +***=> Low VRAM (A card with 24GB) (e.g. NVIDIA GeForce RTX 3090 and 4090).*** + +>***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** + +If you have such a device, you can increase the `batch size` and `lora rank`: `--batch_size 4` and `--lora_rank 64`. This only takes nearly `20GB`. This is consistent with the rank in our paper. Note that you can't use the [OpenVLA-OFT](https://github.com/moojink/openvla-oft) model on a card with `24GB` because even with `batch size = 1`, it requires `25GB` of VRAM. Fortunately, you can use VLA-Adapter. However, the `batch size` is still small, so you can increase `--max_steps` to achieve the performance reported in the paper. + +>***About `vlm_path`.*** + +The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. + +>***About `data_name`.*** + +Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `/libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. + +>***About `use_pro_version`.*** + +In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged.
The `original version` is nearly `1GB` smaller than the `pro version` (1 batch), requiring only `17.6GB` of VRAM. You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. + + + ```bash +data_name=libero_spatial_no_noops + +CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ +--vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ +--config_file_path pretrained_models/configs \ +--data_root_dir data/libero \ +--dataset_name $data_name \ +--run_root_dir outputs \ +--use_film False \ +--num_images_in_input 2 \ +--use_proprio True \ +--use_lora True \ +--use_fz False \ +--use_minivlm True \ +--image_aug True \ +--num_steps_before_decay 200000 \ +--max_steps 200005 \ +--save_freq 5000 \ +--save_latest_checkpoint_only False \ +--merge_lora_during_training True \ +--batch_size 4 \ +--grad_accumulation_steps 4 \ +--learning_rate 2e-4 \ +--lora_rank 64 \ +--use_pro_version True \ +--wandb_entity "YOUR_WANDB_ENTITY" \ +--wandb_project "$data_name" \ +--run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ +> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & +``` + +Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. + + + +
+ +### :ledger: How to Train on Larger VRAM GPUs + +***=> A Consumer GPU with 32GB (e.g. NVIDIA GeForce RTX 5090)
=> A Professional-Grade GPU with 40GB-48GB (e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).*** + + +>***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** + +If you have such a device, you can increase the `batch size` and `lora rank`: `--batch_size 8` and `--lora_rank 64`. This only takes nearly `29GB`. + +>***About `vlm_path`.*** + +The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. + +>***About `data_name`.*** + +Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `/libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. + +With this configuration, you can achieve the same results as in our paper on the `LIBERO-Object` benchmark, achieving a `99.2%` success rate, in just `8 hours`. The `LIBERO-Spatial` benchmark requires approximately 10 hours of training. However, the `LIBERO-Long` benchmark takes longer because its tasks are longer and more difficult, requiring more training steps to achieve superior performance. + +>***About `use_pro_version`.*** + +In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged.
The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. + + ```bash +data_name=libero_spatial_no_noops + +CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ +--vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ +--config_file_path pretrained_models/configs \ +--data_root_dir data/libero \ +--dataset_name $data_name \ +--run_root_dir outputs \ +--use_film False \ +--num_images_in_input 2 \ +--use_proprio True \ +--use_lora True \ +--use_fz False \ +--use_minivlm True \ +--image_aug True \ +--num_steps_before_decay 200000 \ +--max_steps 200005 \ +--save_freq 5000 \ +--save_latest_checkpoint_only False \ +--merge_lora_during_training True \ +--batch_size 8 \ +--grad_accumulation_steps 2 \ +--learning_rate 2e-4 \ +--lora_rank 64 \ +--use_pro_version True \ +--wandb_entity "YOUR_WANDB_ENTITY" \ +--wandb_project "$data_name" \ +--run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ +> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & +``` + +Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. -## Citation -If you use our code in your work, please cite [our paper](https://arxiv.org/abs/2502.19645): + +
+ +### :ledger: How to Train on Sufficient VRAM GPUs + +***=> Professional-Grade GPUs with ≥80GB (e.g. NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).*** + +>***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** + +You can use 1 to 8 GPUs for training by changing the number of `CUDA_VISIBLE_DEVICES` to the GPU number and the number of GPUs after `--nproc-per-node`. In our paper, we use 4×H100 GPUs for training. In this configuration, training takes the following time on the four suites of the LIBERO benchmark: `Spatial` (only five hours), `Object` (less than one hour), `Goal` (three hours), and `Long` (half a day); the `CALVIN` benchmark takes eight hours. + +>***About `vlm_path`.*** + +The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. + +>***About `data_name`.*** + +Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. + + +>***About `use_pro_version`.*** + +In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). 
You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. + +```bash +data_name=libero_spatial_no_noops + +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \ +--vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ +--config_file_path pretrained_models/configs \ +--data_root_dir data/libero \ +--dataset_name $data_name \ +--run_root_dir outputs \ +--use_film False \ +--num_images_in_input 2 \ +--use_proprio True \ +--use_lora True \ +--use_fz False \ +--use_minivlm True \ +--image_aug True \ +--num_steps_before_decay 150000 \ +--max_steps 150005 \ +--save_freq 5000 \ +--save_latest_checkpoint_only False \ +--merge_lora_during_training True \ +--batch_size 16 \ +--grad_accumulation_steps 1 \ +--learning_rate 2e-4 \ +--lora_rank 64 \ +--use_pro_version True \ +--wandb_entity "YOUR_WANDB_ENTITY" \ +--wandb_project "$data_name" \ +--run_id_note VLA-Adapter--spatial--$current_time \ +> logs/VLA-Adapter--spatial--$current_time.log 2>&1 & +``` + +Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. + +## :mechanical_arm: Inference + +### :books: Related File for Inference +* `experiments/robot/libero/`: LIBERO eval files + * `run_libero_eval.py`: LIBERO eval script + * `libero_utils.py`: LIBERO eval utils +* `experiments/robot/`: General eval utils files + * `openvla_utils.py`: VLA-specific eval utils + * `robot_utils.py`: Other eval utils + +
+ +### 🤗 Checkpoint of VLA-Adapter +We fine-tuned `Qwen2.5-0.5B` with our adapter bridge paradigm on four LIBERO task suites independently: `LIBERO-Spatial`, `LIBERO-Object`, `LIBERO-Goal`, and `LIBERO-Long`. +The four VLA-Adapter checkpoints for LIBERO are available on Hugging Face: +* [VLA-Adapter/LIBERO-Spatial](https://huggingface.co/VLA-Adapter/LIBERO-Spatial) +* [VLA-Adapter/LIBERO-Object](https://huggingface.co/VLA-Adapter/LIBERO-Object) +* [VLA-Adapter/LIBERO-Goal](https://huggingface.co/VLA-Adapter/LIBERO-Goal) +* [VLA-Adapter/LIBERO-Long](https://huggingface.co/VLA-Adapter/LIBERO-Long) + +In addition, we also provide a `Pro` version, for which we used `4*H100` GPUs for training with `--batch_size 16`, `--lora_rank 64`, and `--max_steps 100000`. The Pro checkpoints are: + +* [VLA-Adapter/LIBERO-Spatial-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Spatial-Pro) `(97.8 -> 99.6)` +* [VLA-Adapter/LIBERO-Object-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Object-Pro) `(99.2 -> 99.6)` +* [VLA-Adapter/LIBERO-Goal-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Goal-Pro) `(97.2 -> 98.2)` +* [VLA-Adapter/LIBERO-Long-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Long-Pro) `(95.0 -> 96.4)` +* [VLA-Adapter/CALVIN-ABC-Pro](https://huggingface.co/VLA-Adapter/CALVIN-ABC-Pro) `(4.42 -> 4.50)` + +These files need to be placed in the `/outputs` folder. If you trained your own models, they will also be stored here. The subsequent eval code will call the model in this folder for inference. + + +
+ + +### :notebook: How to Eval + +**We strongly recommend that you use our open source `Pro` version of the model, which has stronger performance.** To start evaluations with one of these checkpoints, run one of the commands below. Each will automatically download the appropriate checkpoint listed above. If you want to use the original version of the model, you only need to adjust the `--use_pro_version` parameter to `False` and pass the original version of the model to the `--pretrained_checkpoint` parameter. Finally, the inference results will be displayed in the `/eval_logs` folder, and the inference video will be displayed in the `/rollouts/vla-adapter` folder. + + +```bash +# Launch LIBERO-Spatial-Pro evals (Background running) +CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ + --use_proprio True \ + --num_images_in_input 2 \ + --use_film False \ + --pretrained_checkpoint outputs/LIBERO-Spatial-Pro \ + --task_suite_name libero_spatial \ + --use_pro_version True \ + > eval_logs/Spatial--chkpt.log 2>&1 & + + +# Launch LIBERO-Object-Pro evals (Background running) +CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ + --use_proprio True \ + --num_images_in_input 2 \ + --use_film False \ + --pretrained_checkpoint outputs/LIBERO-Object-Pro \ + --task_suite_name libero_object \ + --use_pro_version True \ + > eval_logs/Object--chkpt.log 2>&1 & + + +# Launch LIBERO-Goal-Pro evals (Background running) +CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ + --use_proprio True \ + --num_images_in_input 2 \ + --use_film False \ + --pretrained_checkpoint outputs/LIBERO-Goal-Pro \ + --task_suite_name libero_goal \ + --use_pro_version True \ + > eval_logs/Goal--chkpt.log 2>&1 & + + +# Launch LIBERO-Long-Pro (LIBERO-10) evals (Background running) +CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ + --use_proprio True \ + --num_images_in_input 2 \ + --use_film False \ + 
--pretrained_checkpoint outputs/LIBERO-Long-Pro \ + --task_suite_name libero_10 \ + --use_pro_version True \ + > eval_logs/Long--chkpt.log 2>&1 & + + +# Launch CALVIN ABC→D-Pro evals (Background running) +CUDA_VISIBLE_DEVICES=0 python vla-scripts/evaluate_calvin.py \ + --pretrained_checkpoint outputs/CALVIN-ABC-Pro \ + > eval_logs/CALVIN--ABC.log 2>&1 & +``` + +The evaluation script will run 500 trials by default (10 tasks x 50 episodes each) in LIBERO and 1,000 task sequences in CALVIN. Use the same card for training and inference whenever possible. **Note that results may vary slightly if you use a different GPU than the H100.** + + +If you want to get the inference **throughput**, you can measure it in the `run_libero_eval.py` file. You can add `start = time.time()` and `end = time.time()` before and after `lines 334--345` and calculate the difference between the two. This difference is the time it takes to generate `8 chunks`. This gives you the inference throughput. We measured it multiple times and took the average value of `0.036s`. + +
+ +## 🌈 Success Rate Comparison + +All our results are inferred on `H100`. You can find the inference `log` file in the model released on [HF](https://huggingface.co/VLA-Adapter) for viewing. + +### Performance on LIBERO benchmark. + +XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LIBERO MethodsScale SpatialObject GoalLong Avg.
Large-scaleFlowVLA (Zhong et al., 2025)8.5B93.295.091.672.688.1
UnifiedVLA (Wang et al., 2025)8.5B95.498.8* 93.6 94.0 95.5
OpenVLA (Kim et al., 2024)7B84.788.479.253.776.5
OpenVLA-OFT (Kim et al., 2025)7B97.6*98.497.994.5*97.1*
UniVLA (Bu et al., 2025)7B96.5 96.8 95.6 92.0 95.2
CoT-VLA (Zhao et al., 2025)7B87.5 91.6 87.6 69.0 81.1
WorldVLA (Cen et al., 2025)7B87.6 96.2 83.4 60.0 81.8
TraceVLA (Zheng et al., 2025)7B84.6 85.2 75.1 54.1 74.8
MolmoAct (Lee et al., 2025)7B87.0 95.4 87.6 77.2 86.6
ThinkAct (Huang et al., 2025)7B88.3 91.4 87.1 70.9 84.4
Small-scale4D-VLA (Zhang et al., 2025)4B88.9 95.2 90.9 79.1 88.6
SpatialVLA (Qu et al., 2025)4B88.2 89.9 78.6 55.5 78.1
π0 (Black et al., 2024)3B96.898.8*95.8 85.2 94.2
π0-FAST (Pertsch et al., 2025)3B96.4 96.8 88.6 60.2 85.5
NORA (Hung et al., 2025)3B92.2 95.4 89.4 74.6 87.9
SmolVLA (Shukor et al., 2025)2.2B93.0 94.0 91.0 77.0 88.8
GR00T N1 (NVIDIA et al., 2025)2B94.4 97.6 93.0 90.6 93.9
Tiny-scaleSeer (Tian et al., 2025)0.57B- - - 78.7 78.7
VLA-OS (Gao et al., 2025)0.5B87.0 96.5 92.7 66.0 85.6
Diffusion Policy (Chi et al., 2023)-78.3 92.5 68.3 50.5 72.4
VLA-Adapter (Ours)0.5B97.899.297.2* 95.0 97.3
VLA-Adapter-Pro (Ours)0.5B99.699.6 98.296.498.5
+ +### Performance on CALVIN ABC→D benchmark. + +XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CALVIN MethodsScale 12 34 5 Avg. len
Large-scaleUniVLA (Bu et al., 2025) 7B 95.5 85.8 75.4 66.9 56.5 3.80
OpenVLA (Kim et al., 2024) 7B 91.3 77.8 62.0 52.1 43.5 3.27
OpenVLA-OFT (Kim et al., 2025) 7B 96.3 89.1 82.4 75.8 66.5 4.10
VLAS (Zhao et al., 2025b) 7B 87.2 64.2 40.9 28.1 19.6 2.40
LCB (Shentu et al., 2024) 7B 73.6 50.2 28.5 16.0 9.9 1.78
RoboDual (Bu et al., 2024a) 7B 94.4 82.7 72.1 62.4 54.4 3.66
OpenHelix (Cui et al., 2025) 7B 97.1* 91.4 82.8 72.6 64.1 4.08
ReconVLA (Song et al., 2025c) 7B 95.6 87.6 76.9 69.3 64.1 3.95
Small-scaleDeeR (Yue et al., 2024) 3B 86.2 70.1 51.8 41.5 30.4 2.82
RoboFlamingo (Li et al., 2024b) 3B 82.4 61.9 46.6 33.1 23.5 2.48
VPP (Hu et al., 2025) 1.5B 95.7 91.2 86.3* 81.0* 75.0* 4.33*
SuSIE (Black et al., 2024)1.3B 87.0 69.0 49.0 38.0 26.0 2.69
Tiny-scaleSeer-Large (Tian et al., 2025)0.57B 96.3 91.6* 86.1 80.3 74.0 4.28
MoDE (Reuss et al., 2025) 0.44B 96.2 88.9 81.1 71.8 63.5 4.01
Seer (Tian et al., 2025) 0.32B 94.4 87.2 79.9 72.2 64.3 3.98
VLA-Adapter (Ours)0.5B99.1 94.6 88.8 82.8 76.5 4.42
VLA-Adapter-Pro (Ours)0.5B98.595.0 90.585.380.04.50
+ + +
+ + +## 📝 Citation + +### 🫶 If you feel that this paper, models, or codes are helpful, please cite our paper, thanks for your support of VLA-Adapter! ```bibtex -@article{kim2025fine, - title={Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success}, - author={Kim, Moo Jin and Finn, Chelsea and Liang, Percy}, - journal={arXiv preprint arXiv:2502.19645}, +@article{wang2025vlaadapter, + author={Wang, Yihao and Ding, Pengxiang and Li, Lingxiao and Cui, Can and Ge, Zirui and Tong, Xinyang and Song, Wenxuan and Zhao, Han and Zhao, Wei and Hou, Pengxu and Huang, Siteng and Tang, Yifan and Wang, Wenhui and Zhang, Ru and Liu, Jianyi and Wang, Donglin}, + title={VLA-Adapter: An Effective Paradigm for Tiny-Scale Vision-Language-Action Model}, + journal={arXiv preprint arXiv:2509.09372}, year={2025} } ``` + +## :heart: Acknowledgment + +We thank [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [MiniVLA](https://github.com/Stanford-ILIAD/openvla-mini), and [RoboDual](https://github.com/OpenDriveLab/RoboDual) for their open-sourced work! 
diff --git a/vla_adapter.egg-info/SOURCES.txt b/vla_adapter.egg-info/SOURCES.txt index 7ed48a4..6cdf1af 100644 --- a/vla_adapter.egg-info/SOURCES.txt +++ b/vla_adapter.egg-info/SOURCES.txt @@ -3,23 +3,12 @@ README.md pyproject.toml experiments/robot/openvla_utils.py experiments/robot/robot_utils.py -experiments/robot/aloha/aloha_utils.py -experiments/robot/aloha/constants.py -experiments/robot/aloha/preprocess_split_aloha_data.py -experiments/robot/aloha/real_env.py -experiments/robot/aloha/robot_utils.py -experiments/robot/aloha/run_aloha_eval.py -experiments/robot/bridge/bridgev2_utils.py -experiments/robot/bridge/run_bridgev2_eval.py -experiments/robot/bridge/widowx_env.py experiments/robot/libero/libero_utils.py experiments/robot/libero/regenerate_libero_dataset.py experiments/robot/libero/run_libero_eval.py -openvla_oft.egg-info/PKG-INFO -openvla_oft.egg-info/SOURCES.txt -openvla_oft.egg-info/dependency_links.txt -openvla_oft.egg-info/requires.txt -openvla_oft.egg-info/top_level.txt +pretrained_models/configs/configuration_prismatic.py +pretrained_models/configs/modeling_prismatic.py +pretrained_models/configs/processing_prismatic.py prismatic/__init__.py prismatic/py.typed prismatic/conf/__init__.py @@ -38,17 +27,20 @@ prismatic/models/load.py prismatic/models/materialize.py prismatic/models/projectors.py prismatic/models/registry.py +prismatic/models/transformer_utils.py prismatic/models/backbones/__init__.py prismatic/models/backbones/llm/__init__.py prismatic/models/backbones/llm/base_llm.py prismatic/models/backbones/llm/llama2.py prismatic/models/backbones/llm/mistral.py prismatic/models/backbones/llm/phi.py +prismatic/models/backbones/llm/qwen25.py prismatic/models/backbones/llm/prompting/__init__.py prismatic/models/backbones/llm/prompting/base_prompter.py prismatic/models/backbones/llm/prompting/llama2_chat_prompter.py prismatic/models/backbones/llm/prompting/mistral_instruct_prompter.py prismatic/models/backbones/llm/prompting/phi_prompter.py 
+prismatic/models/backbones/llm/prompting/qwen_prompter.py prismatic/models/backbones/llm/prompting/vicuna_v15_prompter.py prismatic/models/backbones/vision/__init__.py prismatic/models/backbones/vision/base_vision.py @@ -110,9 +102,17 @@ scripts/additional-datasets/lrv_instruct.py scripts/additional-datasets/lvis_instruct_4v.py scripts/extern/convert_prismatic_weights_to_hf.py scripts/extern/verify_prismatic.py +vla-scripts/calvin_env_wrapper.py vla-scripts/deploy.py +vla-scripts/evaluate_calvin.py vla-scripts/finetune.py vla-scripts/merge_lora_weights_and_save.py vla-scripts/train.py +vla-scripts/vla_evaluation.py vla-scripts/extern/convert_openvla_weights_to_hf.py -vla-scripts/extern/verify_openvla.py \ No newline at end of file +vla-scripts/extern/verify_openvla.py +vla_adapter.egg-info/PKG-INFO +vla_adapter.egg-info/SOURCES.txt +vla_adapter.egg-info/dependency_links.txt +vla_adapter.egg-info/requires.txt +vla_adapter.egg-info/top_level.txt \ No newline at end of file diff --git a/vla_adapter.egg-info/requires.txt b/vla_adapter.egg-info/requires.txt index 1dd95a1..839cc6c 100644 --- a/vla_adapter.egg-info/requires.txt +++ b/vla_adapter.egg-info/requires.txt @@ -20,7 +20,7 @@ tensorflow==2.15.0 tensorflow_datasets==4.9.3 tensorflow_graphics==2021.12.3 dlimp@ git+https://github.com/moojink/dlimp_openvla -diffusers +diffusers==0.30.3 imageio uvicorn fastapi diff --git a/vla_adapter.egg-info/top_level.txt b/vla_adapter.egg-info/top_level.txt index 16a23a4..6720ec3 100644 --- a/vla_adapter.egg-info/top_level.txt +++ b/vla_adapter.egg-info/top_level.txt @@ -1,4 +1,7 @@ +eval_logs experiments +figure +pretrained_models prismatic scripts vla-scripts From 26f7aca312a260543d6c8d42edfdf122fe620cb9 Mon Sep 17 00:00:00 2001 From: ruiheng123 Date: Thu, 9 Oct 2025 19:13:55 +0800 Subject: [PATCH 2/6] feature(wrh): add 3d branch --- prismatic/models/load.py | 2 - prismatic/models/pc_encoder.py | 450 ++++++++++ .../models/pi3/models/layers/attention.py | 16 +- 
prismatic/models/pi3_loader.py | 257 ++++++ .../__pycache__/__init__.cpython-310.pyc | Bin 302 -> 0 bytes .../__pycache__/base_strategy.cpython-310.pyc | Bin 9075 -> 0 bytes .../__pycache__/ddp.cpython-310.pyc | Bin 4838 -> 0 bytes .../__pycache__/fsdp.cpython-310.pyc | Bin 7828 -> 0 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 287 -> 0 bytes .../__pycache__/datasets.cpython-310.pyc | Bin 9218 -> 0 bytes .../rlds/__pycache__/__init__.cpython-310.pyc | Bin 252 -> 0 bytes .../rlds/__pycache__/dataset.cpython-310.pyc | Bin 21911 -> 0 bytes .../obs_transforms.cpython-310.pyc | Bin 3726 -> 0 bytes .../traj_transforms.cpython-310.pyc | Bin 3408 -> 0 bytes .../oxe/__pycache__/__init__.cpython-310.pyc | Bin 291 -> 0 bytes .../oxe/__pycache__/configs.cpython-310.pyc | Bin 10550 -> 0 bytes .../__pycache__/materialize.cpython-310.pyc | Bin 4336 -> 0 bytes .../oxe/__pycache__/mixtures.cpython-310.pyc | Bin 3330 -> 0 bytes .../__pycache__/transforms.cpython-310.pyc | Bin 23443 -> 0 bytes .../__pycache__/droid_utils.cpython-310.pyc | Bin 4822 -> 0 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 172 -> 0 bytes .../__pycache__/data_utils.cpython-310.pyc | Bin 12146 -> 0 bytes .../goal_relabeling.cpython-310.pyc | Bin 1186 -> 0 bytes .../task_augmentation.cpython-310.pyc | Bin 1702 -> 0 bytes vla-scripts/finetune.py | 6 +- vla_adapter.egg-info/PKG-INFO | 785 ------------------ vla_adapter.egg-info/SOURCES.txt | 118 --- vla_adapter.egg-info/dependency_links.txt | 1 - vla_adapter.egg-info/requires.txt | 38 - vla_adapter.egg-info/top_level.txt | 7 - 30 files changed, 724 insertions(+), 956 deletions(-) create mode 100644 prismatic/models/pc_encoder.py create mode 100644 prismatic/models/pi3_loader.py delete mode 100644 prismatic/training/strategies/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/training/strategies/__pycache__/base_strategy.cpython-310.pyc delete mode 100644 prismatic/training/strategies/__pycache__/ddp.cpython-310.pyc delete mode 100644 
prismatic/training/strategies/__pycache__/fsdp.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/__pycache__/datasets.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/__pycache__/dataset.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/__pycache__/obs_transforms.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/__pycache__/traj_transforms.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/__pycache__/configs.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/__pycache__/materialize.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/__pycache__/mixtures.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/__pycache__/transforms.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/oxe/utils/__pycache__/droid_utils.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/utils/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/utils/__pycache__/data_utils.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/utils/__pycache__/goal_relabeling.cpython-310.pyc delete mode 100644 prismatic/vla/datasets/rlds/utils/__pycache__/task_augmentation.cpython-310.pyc delete mode 100644 vla_adapter.egg-info/PKG-INFO delete mode 100644 vla_adapter.egg-info/SOURCES.txt delete mode 100644 vla_adapter.egg-info/dependency_links.txt delete mode 100644 vla_adapter.egg-info/requires.txt delete mode 100644 vla_adapter.egg-info/top_level.txt diff --git a/prismatic/models/load.py b/prismatic/models/load.py index a0eaa82..883adbb 100644 --- a/prismatic/models/load.py +++ b/prismatic/models/load.py @@ -20,8 +20,6 @@ from prismatic.overwatch import initialize_overwatch from 
prismatic.vla.action_tokenizer import ACTION_TOKENIZERS, ActionTokenizer -from prismatic.models.pi3.models.pi3 import Pi3 - # Initialize Overwatch =>> Wraps `logging.Logger` overwatch = initialize_overwatch(__name__) diff --git a/prismatic/models/pc_encoder.py b/prismatic/models/pc_encoder.py new file mode 100644 index 0000000..9b1040c --- /dev/null +++ b/prismatic/models/pc_encoder.py @@ -0,0 +1,450 @@ + +""" +pc_encoder.py + +Implementations of pointcloud encoder in iDP3, which also supports 2D point cloud maps. + +# reference: https://github.com/YanjieZe/Improved-3D-Diffusion-Policy/blob/main/Improved-3D-Diffusion-Policy/diffusion_policy_3d/model/vision_3d +""" +from typing import Tuple, List, Optional, Dict, Union, Type +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from prismatic.overwatch import initialize_overwatch +overwatch = initialize_overwatch(__name__) +# ==================== Utility Functions ==================== + +def meanpool(x, dim=-1, keepdim=False): + out = x.mean(dim=dim, keepdim=keepdim) + return out + +def maxpool(x, dim=-1, keepdim=False): + out = x.max(dim=dim, keepdim=keepdim).values + return out + +def shuffle_point_torch(point_cloud): + B, N, C = point_cloud.shape + indices = torch.randperm(N) + return point_cloud[:, indices] + +def pad_point_torch(point_cloud, num_points): + B, N, C = point_cloud.shape + device = point_cloud.device + if num_points > N: + num_pad = num_points - N + pad_points = torch.zeros(B, num_pad, C).to(device) + point_cloud = torch.cat([point_cloud, pad_points], dim=1) + point_cloud = shuffle_point_torch(point_cloud) + return point_cloud + +def uniform_sampling_torch(point_cloud, num_points): + B, N, C = point_cloud.shape + if num_points == N: + return point_cloud + if num_points > N: + return pad_point_torch(point_cloud, num_points) + + # random sampling + indices = torch.randperm(N)[:num_points] + sampled_points = point_cloud[:, indices] + 
return sampled_points + +# ==================== 2D Point Cloud utility ==================== + +def shuffle_map_torch(map_data): + B, H, W, C = map_data.shape + # Flatten spatial dimensions + map_flat = map_data.view(B, H * W, C) + indices = torch.randperm(H * W) + map_shuffled = map_flat[:, indices, :] + # Reshape back + return map_shuffled.view(B, H, W, C) + + +def pad_map_torch(map_data, target_size): + B, H, W, C = map_data.shape + device = map_data.device + + if target_size > H or target_size > W: + # Create zero-padded map + padded_map = torch.zeros(B, target_size, target_size, C, device=device) + # Copy original data to top-left corner + padded_map[:, :H, :W, :] = map_data + # Optionally shuffle to distribute zeros randomly + return shuffle_map_torch(padded_map) + + return map_data + + +def resize_map_torch(map_data, target_size): + B, H, W, C = map_data.shape + + # Convert to [B, C, H, W] for F.interpolate + map_permuted = map_data.permute(0, 3, 1, 2) + + # Resize + if isinstance(target_size, int): + target_size = (target_size, target_size) + + map_resized = F.interpolate( + map_permuted, + size=target_size, + mode='bilinear', + align_corners=False + ) + + # Convert back to [B, H, W, C] + return map_resized.permute(0, 2, 3, 1) + +def crop_map_torch(map_data, target_size): + B, H, W, C = map_data.shape + + if isinstance(target_size, int): + target_h = target_w = target_size + else: + target_h, target_w = target_size + + if H < target_h or W < target_w: + # If smaller than target, pad first + return pad_map_torch(map_data, max(target_h, target_w)) + + # Random crop + top = torch.randint(0, H - target_h + 1, (1,)).item() + left = torch.randint(0, W - target_w + 1, (1,)).item() + + return map_data[:, top:top+target_h, left:left+target_w, :] + + +def uniform_sampling_map_torch(map_data, target_size, method='resize'): + """ + Unified sampling function for 2D maps + Args: + map_data: [B, H, W, 3] + target_size: int or tuple (target_H, target_W) + method: 'resize', 
'crop', or 'pad' + Returns: + sampled map: [B, target_H, target_W, 3] + """ + B, H, W, C = map_data.shape + + if isinstance(target_size, int): + target_h = target_w = target_size + else: + target_h, target_w = target_size + + # If already at target size, return as is + if H == target_h and W == target_w: + return map_data + + if method == 'resize': + return resize_map_torch(map_data, target_size) + elif method == 'crop': + return crop_map_torch(map_data, target_size) + elif method == 'pad': + return pad_map_torch(map_data, target_size) + else: + raise ValueError(f"Unknown method: {method}. Use 'resize', 'crop', or 'pad'.") + +# ==================== 1D Point Cloud Encoder ==================== + +class MultiStagePointNetEncoder(nn.Module): + """1D Point Cloud Encoder using 1D convolutions""" + def __init__(self, h_dim=128, out_channels=128, num_layers=4, **kwargs): + super().__init__() + + self.h_dim = h_dim + self.out_channels = out_channels + self.num_layers = num_layers + + self.act = nn.LeakyReLU() + + self.conv_in = nn.Conv1d(3, h_dim, kernel_size=1) + self.layers, self.global_layers = nn.ModuleList(), nn.ModuleList() + for _ in range(self.num_layers): + self.layers.append(nn.Conv1d(h_dim, h_dim, kernel_size=1)) + self.global_layers.append(nn.Conv1d(h_dim * 2, h_dim, kernel_size=1)) + self.conv_out = nn.Conv1d(h_dim * self.num_layers, out_channels, kernel_size=1) + + def forward(self, x): + # x: [B, L, 3] --> [B, 3, L] + assert x.shape[-1] == 3, f"Input shape must have 3 channels at the last dim, got{x.shape}" + x = x.transpose(1, 2) + y = self.act(self.conv_in(x)) + feat_list = [] + for i in range(self.num_layers): + y = self.act(self.layers[i](y)) + y_global = y.max(-1, keepdim=True).values + y = torch.cat([y, y_global.expand_as(y)], dim=1) + y = self.act(self.global_layers[i](y)) + feat_list.append(y) + x = torch.cat(feat_list, dim=1) + x = self.conv_out(x) + + x_global = x.max(-1).values # [B, out_channels] + + return x_global + + +# ==================== 2D 
Point Cloud Map Encoder ==================== + +class MultiStageMapNetEncoder(nn.Module): + """2D Point Cloud Map Encoder using 2D convolutions""" + def __init__(self, h_dim=128, out_channels=128, num_layers=4, **kwargs): + super().__init__() + + self.h_dim = h_dim + self.out_channels = out_channels + self.num_layers = num_layers + self.act = nn.LeakyReLU() + self.conv_in = nn.Conv2d(3, h_dim, kernel_size=3, padding=1) + + self.layers = nn.ModuleList() + self.global_layers = nn.ModuleList() + + for _ in range(self.num_layers): + self.layers.append(nn.Conv2d(h_dim, h_dim, kernel_size=3, stride=1, padding=1)) + self.global_layers.append(nn.Conv2d(h_dim * 2, h_dim, kernel_size=1, stride=1, padding=0)) + + self.conv_out = nn.Conv2d(h_dim * self.num_layers, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + # x: [B, 3, H, W] + y = self.act(self.conv_in(x)) + feat_list = [] + + for i in range(self.num_layers): + y = self.act(self.layers[i](y)) + y_global = F.adaptive_max_pool2d(y, 1) # [B, h_dim, 1, 1] + y_global_expanded = y_global.expand_as(y) + y = torch.cat([y, y_global_expanded], dim=1) + y = self.act(self.global_layers[i](y)) + feat_list.append(y) + + x = torch.cat(feat_list, dim=1) + x = self.conv_out(x) + + x_global = F.adaptive_max_pool2d(x, 1) # [B, out_channels, 1, 1] + x_global = x_global.squeeze(-1).squeeze(-1) # [B, out_channels] + + return x_global + + +# ==================== Unified iDP3 Encoder ==================== + +class iDP3Encoder(nn.Module): + """ + Unified Point Cloud Encoder + + Supports 4 input formats: + 1. [B, L, 3] - Single 1D point cloud per batch + 2. [B, N, L, 3] - Multiple 1D point clouds (N views) + 3. [B, H, W, 3] - Single 2D point cloud map per batch + 4. 
[B, N, H, W, 3] - Multiple 2D point cloud maps (N views) + + Output: + - For single input: [B, out_channels] + - For multi-view input: [B, N, out_channels] (encode each view separately) + """ + def __init__(self, + out_channels=128, + num_points=4096, # Only used for 1D point cloud downsampling + target_map_size=224, # Target size for 2D map processing + h_dim=128, + num_layers=4, + point_downsample=True, # Only for 1D point clouds + map_sampling_method='resize', # 'resize', 'crop', or 'pad' for 2D maps + ): + super().__init__() + self.n_output_channels = out_channels + self.num_points = num_points + self.downsample = point_downsample + self.target_map_size = target_map_size + self.map_sampling_method = map_sampling_method + + # 1D Point Cloud Encoder + self.pointnet_encoder = MultiStagePointNetEncoder( + h_dim=h_dim, + out_channels=out_channels, + num_layers=num_layers + ) + + # 2D Map Encoder + self.mapnet_encoder = MultiStageMapNetEncoder( + h_dim=h_dim, + out_channels=out_channels, + num_layers=num_layers + ) + + overwatch.info(f"iDP3 Encoder has num layers: {num_layers}, h_dim: {h_dim}, output dim: {self.n_output_channels}") + + def _encode_1d_pointcloud(self, pc): + if self.downsample: + pc = uniform_sampling_torch(pc, self.num_points) + return self.pointnet_encoder(pc) + + def _encode_2d_map(self, map_data): + H, W = map_data.shape[1:3] # map_data is [B, H, W, 3]; permuted to [B, 3, H, W] below + if H != self.target_map_size or W != self.target_map_size: + map_data = uniform_sampling_map_torch( + map_data, + self.target_map_size, + method=self.map_sampling_method + ) + map_data = map_data.permute(0, 3, 1, 2) + return self.mapnet_encoder(map_data) + + def forward(self, x: torch.Tensor, multi_scene: bool = False) -> torch.Tensor: + """ + Args: + x: Point cloud tensor in one of these formats: + - [B, L, 3]: Single 1D point cloud + - [B, N, L, 3]: Multiple 1D point clouds + - [B, H, W, 3]: Single 2D point cloud map + - [B, N, H, W, 3]: Multiple 2D point 
cloud maps + + Returns: + features: + - [B, out_channels] for 
single input + - [B, N, out_channels] for multi-view input + """ + assert x.shape[-1] == 3, f"Last dimension must be 3 (XYZ), got {x.shape[-1]}" + + ndim = len(x.shape) + + if ndim == 3: + # Case 1: [B, L, 3] - Single 1D point cloud + return self._encode_1d_pointcloud(x) + + elif ndim == 4: + B, dim1, dim2, C = x.shape + # Distinguish between [B, N, L, 3] and [B, H, W, 3] + if multi_scene: + # Case 2: [B, N, L, 3] - Multiple 1D point clouds + B, N, L, C = x.shape + x_reshaped = x.view(B * N, L, C) # Reshape to [B*N, L, 3] + features = self._encode_1d_pointcloud(x_reshaped) # [B*N, out_channels] + features = features.view(B, N, -1) # Reshape back to [B, N, out_channels] + return features + + else: + # Case 3: [B, H, W, 3] - Single 2D point cloud map + return self._encode_2d_map(x) + + elif ndim == 5: + # Case 4: [B, N, H, W, 3] - Multiple 2D point cloud maps + B, N, H, W, C = x.shape + x_reshaped = x.view(B * N, H, W, C) + features = self._encode_2d_map(x_reshaped) # [B*N, out_channels] + features = features.view(B, N, -1) # Reshape back to [B, N, out_channels] + return features + + else: + raise ValueError(f"Unsupported input shape: {x.shape}. 
Expected 3, 4, or 5 dimensions.") + @property + def output_shape(self) -> int: + return self.n_output_channels + + +# ==================== Main Test ==================== + +if __name__ == "__main__": + print("="*70) + print("Testing Unified iDP3Encoder") + print("="*70) + + # Initialize encoder + encoder = iDP3Encoder( + out_channels=256, + num_points=4096, + h_dim=128, + num_layers=4, + point_downsample=True + ) + + print(f"\nTotal parameters: {sum(p.numel() for p in encoder.parameters()):,}") + encoder.eval() + + # ========== Case 1: [B, L, 3] - Single 1D point cloud ========== + print("\n" + "="*70) + print("Case 1: [B, L, 3] - Single 1D point cloud") + print("="*70) + + B, L = 4, 8192 + pc_1d = torch.randn(B, L, 3) + print(f"Input shape: {pc_1d.shape}") + + with torch.no_grad(): + out_1d = encoder(pc_1d) + + print(f"Output shape: {out_1d.shape}") + print(f"Expected: [{B}, {encoder.output_shape}]") + assert out_1d.shape == (B, 256), f"Shape mismatch! Got {out_1d.shape}" + print("✓ Test passed!") + + # ========== Case 2: [B, N, L, 3] - Multiple 1D point clouds ========== + print("\n" + "="*70) + print("Case 2: [B, N, L, 3] - Multiple 1D point clouds") + print("="*70) + + B, N, L = 4, 3, 8192 + pc_multi_1d = torch.randn(B, N, L, 3) + print(f"Input shape: {pc_multi_1d.shape}") + print(f"N={N} views, each with {L} points") + + with torch.no_grad(): + out_multi_1d = encoder(pc_multi_1d, multi_scene=True) + + print(f"Output shape: {out_multi_1d.shape}") + print(f"Expected: [{B}, {N}, {encoder.output_shape}]") + assert out_multi_1d.shape == (B, N, 256), f"Shape mismatch! Got {out_multi_1d.shape}" + print("✓ Test passed! 
Each view encoded separately.") + + # ========== Case 3: [B, H, W, 3] - Single 2D point cloud map ========== + print("\n" + "="*70) + print("Case 3: [B, H, W, 3] - Single 2D point cloud map") + print("="*70) + + B, H, W = 4, 224, 224 + map_2d = torch.randn(B, H, W, 3) + print(f"Input shape: {map_2d.shape}") + + with torch.no_grad(): + out_2d = encoder(map_2d, multi_scene=False) + + print(f"Output shape: {out_2d.shape}") + print(f"Expected: [{B}, {encoder.output_shape}]") + assert out_2d.shape == (B, 256), f"Shape mismatch! Got {out_2d.shape}" + print("✓ Test passed!") + + # ========== Case 4: [B, N, H, W, 3] - Multiple 2D point cloud maps ========== + print("\n" + "="*70) + print("Case 4: [B, N, H, W, 3] - Multiple 2D point cloud maps") + print("="*70) + + B, N, H, W = 4, 5, 224, 224 + map_multi_2d = torch.randn(B, N, H, W, 3) + print(f"Input shape: {map_multi_2d.shape}") + print(f"N={N} views, each with {H}x{W} resolution") + + with torch.no_grad(): + out_multi_2d = encoder(map_multi_2d) + + print(f"Output shape: {out_multi_2d.shape}") + print(f"Expected: [{B}, {N}, {encoder.output_shape}]") + assert out_multi_2d.shape == (B, N, 256), f"Shape mismatch! Got {out_multi_2d.shape}" + print("✓ Test passed! Each map encoded separately.") + + # ========== Summary ========== + print("\n" + "="*70) + print("Summary of All Test Cases") + print("="*70) + print(f"Case 1: [B, L, 3] → [{B}, {encoder.output_shape}]") + print(f"Case 2: [B, N, L, 3] → [{B}, {N}, {encoder.output_shape}] ({N} views)") + print(f"Case 3: [B, H, W, 3] → [{B}, {encoder.output_shape}]") + print(f"Case 4: [B, N, H, W, 3] → [{B}, {N}, {encoder.output_shape}] ({N} maps)") + print("\n✨ All tests passed! 
Unified encoder works correctly for all cases.") + print("="*70) \ No newline at end of file diff --git a/prismatic/models/pi3/models/layers/attention.py b/prismatic/models/pi3/models/layers/attention.py index 728b27b..ca7702b 100644 --- a/prismatic/models/pi3/models/layers/attention.py +++ b/prismatic/models/pi3/models/layers/attention.py @@ -100,10 +100,14 @@ def forward(self, x: Tensor, attn_bias=None) -> Tensor: q, k, v = [qkv[:,:,i] for i in range(3)] if q.dtype == torch.bfloat16: - with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION): + with torch.backends.cuda.sdp_kernel(enable_flash=True, + enable_mem_efficient=False, + enable_math=False): x = scaled_dot_product_attention(q, k, v) else: - with nn.attention.sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]): + with torch.backends.cuda.sdp_kernel(enable_flash=False, + enable_mem_efficient=True, + enable_math=True): x = scaled_dot_product_attention(q, k, v) x = x.transpose(1, 2).reshape([B, N, C]) @@ -334,10 +338,14 @@ def forward(self, x: Tensor, attn_bias=None, xpos=None) -> Tensor: k = self.rope(k, xpos) if q.dtype == torch.bfloat16: - with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION): + with torch.backends.cuda.sdp_kernel(enable_flash=True, + enable_mem_efficient=False, + enable_math=False): x = scaled_dot_product_attention(q, k, v) else: - with nn.attention.sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]): + with torch.backends.cuda.sdp_kernel(enable_flash=False, + enable_mem_efficient=True, + enable_math=True): x = scaled_dot_product_attention(q, k, v) x = x.transpose(1, 2).reshape([B, N, C]) diff --git a/prismatic/models/pi3_loader.py b/prismatic/models/pi3_loader.py new file mode 100644 index 0000000..286577a --- /dev/null +++ b/prismatic/models/pi3_loader.py @@ -0,0 +1,257 @@ +""" +pi3_loader.py + +Implementations of pi3_loader, loading pi3 model which predicts pointclouds and camera extrinsics from images. 
+""" +from typing import Tuple, List, Optional, Dict, Union, Type +from pathlib import Path +from termcolor import cprint + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +from prismatic.models.pi3.models.pi3 import Pi3 +from prismatic.overwatch import initialize_overwatch + +overwatch = initialize_overwatch(__name__) + +def load_pc_model(pi3_path: Union[str, Path]) -> Pi3: + overwatch.info(f"Loading PC model from {pi3_path}") + if pi3_path is not None: + pc_model = Pi3.from_pretrained(Path(pi3_path) if isinstance(pi3_path, str) else pi3_path) + overwatch.info(f"PC model Loaded Successfully from loacal dir: {pi3_path}") + else: + raise ValueError("Please provide a valid path or repo id to a PC model") + + return pc_model + +# Pointcloud Encoder +def meanpool(x, dim=-1, keepdim=False): + out = x.mean(dim=dim, keepdim=keepdim) + return out + +def maxpool(x, dim=-1, keepdim=False): + out = x.max(dim=dim, keepdim=keepdim).values + return out + +class MultiStagePointNetEncoder(nn.Module): + def __init__(self, h_dim=128, out_channels=128, num_layers=4, **kwargs): + super().__init__() + + self.h_dim = h_dim + self.out_channels = out_channels + self.num_layers = num_layers + + self.act = nn.LeakyReLU(negative_slope=0.0, inplace=False) + + self.conv_in = nn.Conv1d(3, h_dim, kernel_size=1) + self.layers, self.global_layers = nn.ModuleList(), nn.ModuleList() + for i in range(self.num_layers): + self.layers.append(nn.Conv1d(h_dim, h_dim, kernel_size=1)) + self.global_layers.append(nn.Conv1d(h_dim * 2, h_dim, kernel_size=1)) + self.conv_out = nn.Conv1d(h_dim * self.num_layers, out_channels, kernel_size=1) + + def forward(self, x): + x = x.transpose(1, 2) # [B, N, 3] --> [B, 3, N] + y = self.act(self.conv_in(x)) + feat_list = [] + for i in range(self.num_layers): + y = self.act(self.layers[i](y)) + y_global = y.max(-1, keepdim=True).values + y = torch.cat([y, y_global.expand_as(y)], dim=1) + y = self.act(self.global_layers[i](y)) + 
feat_list.append(y) + x = torch.cat(feat_list, dim=1) + x = self.conv_out(x) + + x_global = x.max(-1).values + + return x_global + +def shuffle_point_numpy(point_cloud): + B, N, C = point_cloud.shape + indices = np.random.permutation(N) + return point_cloud[:, indices] + +def pad_point_numpy(point_cloud, num_points): + B, N, C = point_cloud.shape + if num_points > N: + num_pad = num_points - N + pad_points = np.zeros((B, num_pad, C)) + point_cloud = np.concatenate([point_cloud, pad_points], axis=1) + point_cloud = shuffle_point_numpy(point_cloud) + return point_cloud + +def uniform_sampling_numpy(point_cloud, num_points): + B, N, C = point_cloud.shape + # padd if num_points > N + if num_points > N: + return pad_point_numpy(point_cloud, num_points) + + # random sampling + indices = np.random.permutation(N)[:num_points] + sampled_points = point_cloud[:, indices] + return sampled_points + +def shuffle_point_torch(point_cloud): + B, N, C = point_cloud.shape + indices = torch.randperm(N) + return point_cloud[:, indices] + +def pad_point_torch(point_cloud, num_points): + B, N, C = point_cloud.shape + device = point_cloud.device + if num_points > N: + num_pad = num_points - N + pad_points = torch.zeros(B, num_pad, C).to(device) + point_cloud = torch.cat([point_cloud, pad_points], dim=1) + point_cloud = shuffle_point_torch(point_cloud) + return point_cloud + +def uniform_sampling_torch(point_cloud, num_points): + B, N, C = point_cloud.shape + device = point_cloud.device + # padd if num_points > N + if num_points == N: + return point_cloud + if num_points > N: + return pad_point_torch(point_cloud, num_points) + + # random sampling + indices = torch.randperm(N)[:num_points] + sampled_points = point_cloud[:, indices] + return sampled_points + +def create_mlp( + input_dim: int, + output_dim: int, + net_arch: List[int], + activation_fn: Type[nn.Module] = nn.ReLU, + squash_output: bool = False, +) -> List[nn.Module]: + """ + Create a multi layer perceptron (MLP), which is + a 
collection of fully-connected layers each followed by an activation function. + + :param input_dim: Dimension of the input vector + :param output_dim: + :param net_arch: Architecture of the neural net + It represents the number of units per layer. + The length of this list is the number of layers. + :param activation_fn: The activation function + to use after each layer. + :param squash_output: Whether to squash the output using a Tanh + activation function + :return: + """ + + if len(net_arch) > 0: + modules = [nn.Linear(input_dim, net_arch[0]), activation_fn()] + else: + modules = [] + + for idx in range(len(net_arch) - 1): + modules.append(nn.Linear(net_arch[idx], net_arch[idx + 1])) + modules.append(activation_fn()) + + if output_dim > 0: + last_layer_dim = net_arch[-1] if len(net_arch) > 0 else input_dim + modules.append(nn.Linear(last_layer_dim, output_dim)) + if squash_output: + modules.append(nn.Tanh()) + return modules + + +class iDP3Encoder(nn.Module): + """ + 修改后的 iDP3Encoder,只处理点云数据,删除了所有 state 相关的部分 + """ + def __init__(self, + observation_space: Dict, + pointcloud_encoder_cfg=None, + use_pc_color=False, + pointnet_type='multi_stage_pointnet', + point_downsample=True, + ): + super().__init__() + self.point_cloud_key = 'point_cloud' + self.n_output_channels = pointcloud_encoder_cfg.out_channels + + self.point_cloud_shape = observation_space[self.point_cloud_key] + self.num_points = pointcloud_encoder_cfg.num_points # 4096 + + print(f"[iDP3Encoder] point cloud shape: {self.point_cloud_shape}") + + self.use_pc_color = use_pc_color + self.pointnet_type = pointnet_type + + self.downsample = point_downsample + if self.downsample: + self.point_preprocess = uniform_sampling_torch + else: + self.point_preprocess = nn.Identity() + + if pointnet_type == "multi_stage_pointnet": + self.extractor = MultiStagePointNetEncoder( + out_channels=pointcloud_encoder_cfg.out_channels + ) + else: + raise NotImplementedError(f"pointnet_type: {pointnet_type}") + + 
print(f"[iDP3Encoder] output dim: {self.n_output_channels}") + + def forward(self, observations: Dict) -> torch.Tensor: + points = observations[self.point_cloud_key] + assert len(points.shape) == 3, f"point cloud shape: {points.shape}, length should be 3" + + # 下采样点云 + if self.downsample: + points = self.point_preprocess(points, self.num_points) + + # 提取点云特征 + pn_feat = self.extractor(points) # B * out_channels + + return pn_feat + + def output_shape(self): + return self.n_output_channels + + +class PointCloudEncoderConfig: + def __init__(self, out_channels=128, num_points=4096): + self.out_channels = out_channels + self.num_points = num_points + +if __name__ == "__main__": + + pc_model = load_pc_model("/home/ruihengwang/vla/VLA-Adapter/pretrained_models/pi3_checkpoint") + batch_size = 2 + out_channels = 128 + num_points = 4096 + observation_space = { + 'point_cloud': (num_points, 3), + } + pointcloud_encoder_cfg = PointCloudEncoderConfig( + out_channels=out_channels, + num_points=num_points + ) + encoder = iDP3Encoder( + observation_space=observation_space, + pointcloud_encoder_cfg=pointcloud_encoder_cfg, + point_downsample=True + ) + encoder.eval() + point_cloud = torch.randn(batch_size, num_points, 3) + + observations = { + 'point_cloud': point_cloud + } + with torch.no_grad(): + output = encoder(observations) + print(f"Input shape: {observations['point_cloud'].shape}") + print(f"\n输出特征形状: {output.shape}") + print(f"输出特征范围: [{output.min():.3f}, {output.max():.3f}]") + print(f"输出维度: {encoder.output_shape()}") diff --git a/prismatic/training/strategies/__pycache__/__init__.cpython-310.pyc b/prismatic/training/strategies/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index fe4624e162f2a61a46e83a34165ac8a094d333bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 302 zcmd1j<>g`k0KBz}W~An& zmnY_>>zCyu>WBF_>N=(*7L=qG=@%4b7Uw3GWG3qajRrePA8JWvYO#KNJW!}4K3=b& b@)n0pZhlH>PO2Tq7sYHqf`@^JQGgKuAWBbi diff --git 
a/prismatic/training/strategies/__pycache__/base_strategy.cpython-310.pyc b/prismatic/training/strategies/__pycache__/base_strategy.cpython-310.pyc deleted file mode 100644 index f4b0991d71e854a66887f4d3b7079722b14d2b1f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9075 zcmb_iS#%rMd7cfxU?m8GJ1K#(EDN?oO5V4)+ALePL@AcMWD<8Y#2u0VX8`EVfEIa% ziDTJy(nht@p40822YhtWHc#z2?Ni&6zVxZj_q9(tr%Bs5O}!=R_ul~^P*$3kKxcm5 z<-hm-bC>`7{yS>r^BD!dm;UD8)~}sclz*qp@XtWz1fJlCswhmc6s9uGQ+>_SeBIJj z%IlutC#-}_8(z{+St;MNOqox3X)6PrgqO9l@^8-1TLoE{^hW%mRrE)#QNLuB{4r}x zmZ!Y3U$)9JZF=MWgf$`4X>Zb>vZiD@<4yZB){IPNy;* z`~_>lKVTh@<$3Qu|DbhHrVHNv{vqp-OpkaE_=l~-e$}eVe9?Q*f5>`BrboRa{!#0w zzi2J`OV*Nq%sPhhl69PoSr5B2?jtu;>(Lh!R%YYxDs0@PB=7WnH48+ z?I7e%=&o%owzo2wlPi?1g_WA;1VM$ltMz6*thbt#)@sG69AP34CY@>V@uudH@Fui`YDt#no8Rq^VpA>ME?bBi`!RTKKj zQ>R72=}lAoK2=p1mz;1N)#ho(^PCmW73Re@y_n-IR}GOlBaxQc^weKlW*WoqhmJ=R-+zvYJ zc8k*xTYE;$~Oq73$kAr`0%l`pUVB%l4Ub7sNy_edXfI&oA4jo`2!ovVHE%8)9_% z)eCl9ciBGu;;YLqi`=;vmM^~YynSx@%=2&D%40v=(jaQIGlqW;Aaepw@Hozxt5_;r zlLq$!=j7@QgXyaYX1EEKaFZ88(Wi zgl7yrWmy@ma&B@BZf;eZR?vPNx)aD9Vv}s@-Gr5Q3u!&Avd7u9JHlqr^DLe@v`a&K z9zBtEj_qUn-%Y?>6y1Wu;yJKXQIsd2p}=G`WpUfHBh$3t%XcB|ebHzM};s{Q(Pm)ULZ*6IO{oZa`N zVhX#vL+{qK$&21m5#x8aVRHDwTP?l;Tj$y^4xY-C31 zDAVgRhmt&=LX?eiQfCCYVnk)5Q30|PQOQ_Tge*s+kmF4K28}v_XA+~L*G6_iVcLck zy!n#C^c(6surn^7QNpS2^sX{P*3Y1{#3RUZINO+meh2#4x9_Oo{0$}eCUV%{@8B$6 z235cgODF%{p@{@va(DTG&2z zk?K#KJ^pw_E>UH$takts7&GvB3)`nposAt~1?51=J9QNRtwf}Chcj#}_nAtk39P7G zy=(QaR`>A*n9dK7xDP^%0C{Y`MO)L}aJK?6E_apOj>E+{&+~CCY8xxSK&jLWRdb7eK2Q$CLFt7F0lTyOQR zIr{C`&L74IA1V9<{(N?6X}#sUOT1HGcbjW(0e_Y@J!c63d-No8+CcNAb|0OW`p0~! 
zN0`84mhR?Y!nEVVY}1SdG>a4C6M9Z9=mm8`E2v35smgzPPA?gnYTTaCl4@BksK&1h zZF~Akk1X~5=3>7;k>Z`E&FWlad(M$&P$Z=D6d7q;4E%)_wsh>2X#3QXFrxpXG##4L zcs?eVLZ|5?by-eFHOYSmMVu=684}NvI7{LM5+?@YBvN7|07Dz{k*g>S2viMoyYi(M zxl36Tdydm~uS@zA)O0lM|0vxzl)s1!KS$yOL{%R$lU(B6cBN^iaPGiUQ2Vj8NZp?K zl0^fx8jaNdQTo@7ti~gsfJ!{*Nu)%f=GEJGa3-MI3lQ7;zcl1tO<{td!_KDrSs@@I zbreTkGO1fB>Nv%EWYetmh?Gcyv`Q8@8-rjClrQ8RNvQ_G039Ef)u@mDk02zAht@Xu zF{Xs6k+G@p`6xjS8h_QUva9T*LKWbWf-BZSy=&|$kcnL-N-~`pH&Z)ivk)dDb-0ct zWNq?ha>oo)Xst9%fQiPamozg`o(?ln`UcfxXOtZRh8#81GI{SvW~HLGuUS&~^$tBjHqIngRG41V^(he^dWpztjiRvBI1G~zuf!R9Tt%W$9WS1fX)YcaE;gF!gHxFkO9yVpKdq;3BH!ZqV_(uZKDo+LLoz+!P zPSWCEBe5Uiql7>RKj62ipch1DV)I<$gIO9Y=7@nr-sM}2ni>MY89JUYm>bkY0wj3T zg}bN+oDgdO(bHLnTmNU6X|*rp(3bvfFL+ToGp=927kFWNUdjYI$p;OL=sW3 z>jk0&xCYRZQv$3Za&rT`T7zC;gM>%IC((rX2;nYG<}GrclA9rdD|XtLeY&V{+cx@g ztD3|Xp?SP|#41zJ+4p+32NVw113p0=93t^FiDyXAT=Fyt@@p}3kz_fJp`QrK?4C~e3^<3dUyU5lJT2o$vE04w#S9h z?gZ=nC8{LfER1^S`dC-oE{N2M!+9MfD;ZBB5Em_J$$4GgPgKWZKzfCmUnX&d#C;@Q zB|%#b_h3C#8O-mSC>9wy6`);otuZ3$m`yQOOem7^KT22;r&~=fJ zn^z=T@=_Fp@CmB@seM>2jR59*L6SB9_VZ_MrAWW+#J(*ii5x zWQfo&RBvcO8X=L|3#c?1VwL(KR;Wu1HkbhnPQVK%7-2Jlzitw~pD-E-&5RCcJyL|t zsT*n!Re{_r6=u6RiK^yJ<3sg>cXray%M&t7JRBj+MJja$Uir<9olH2|C^6C*!&7D% zAm^Oa%tslP1x{vIPRhKLz*07H(*O!(8{<1U_83@ed7$QwJO;eymE^|VS-YiyuJd1I$6Z7c%O$0FR;vWc6xv5w~u*mo*C z09z)f6^u5HxgNiPtE3IgD?b&DgMGa5T_qgt-uJ$;s3cpro8f*Aa<4E^tG!;$nYCAJ{ zr^W9m-KAY6nv7=To6XRBUAqM=KQ`D2$9t)VyZuyucNmRFm^P{4jj>O$PLIwY*x#9z zbJeShW>Nld_c1V(1lDTyCRh`8>EpCZQA2xF{ZMVajW&-^ej{NhJ9A(H-;*T?@T7#~ zNk2sG6DWI9md}EZJO#Ux;DvK?EuQW^Bke?6{zJ)M20M-!;4kU!v$8(j{Yqp?{_^7R zt|T4FK>lfXXHsc(e!J0UAcz`&3oIn|7nL3X*+WJaw-4_b!@b^iYalF6vvSwP^8&vH zkFove9(6`Ho$Ip z>wpL7tWNjf2C)tBBV2gIG0WgyI@Q0p9^Z2ZJ*Y45DeftAj&Vf3=WXq#;{zOte)NAb z_(O)`pP&q}28R7+atBe)ca#n7``R__@=ySYt7~oXm@r(=^+9Je)`KvKUYTb-pfX*n zKevG~WT;zKwjaAECJ+epk#6x>4}8H3o7{~vN#GjM+3kXEIcfxQA zE32vu#H-JV46Ys#`8my+%LyT30<@PI@eWlG{kg-8ct}~vstAAME0*h&BG$vVNf625 z5s70Y2KS3hK|!haJt;& zqaE#WsOmw-7YTVi#Q%hpXK}X+po|ffxGYlrj~_y> 
zwL2mKy4KzjTC*tzpQs#9&aG97DVmJ{VSv#vje3-1jem=*8gR9kguYE>-zM=A3F2w| z9TGQ4yi4Le62AvgosTKmhg2Xb+4m_WdD$OP>W@jhPvW~IexJl2koX=6Nu$Q-)$y4G zEGqtZLyUc5jkh{Yl<-ctK2$z-k8e40Gg*b+5$hkU(LE=K4&t4yCd}Su46GEUlq#*# zT}RYrfi**>*|>GUjd-Wk3HmcnbyN2!UISkZtkJ!eph(h5kp>C!>rGaN;up%d%v4U*-aq)W?St$m1%c8dc;97sfk70? zzlsu6e1Jb!s&jPRNIuWDMaJ%5AKM~l+t)gd7uUqc`8hP;^wmq~pq@BYA?Had`ztCl zR$48OuTwrj;uHxYo!lTHcl~)vQ9Q#<5>q5bNc4PRIHBF*O;_peE*ehspc?E*J zO7O1`rlO6|Jkoj^KLv~b6;}6`nxSDGwa=4D?K3T4sKZ_;pr`nkI6Rn*25A6vpi9SCY?lXx#QHnlupK9qTRnxF0 za`lZ*hxKUeSdYfee4>0yE1|FdN2l8uf8HD z75;9kjVLIdaJdvef+oCHJB~6(<<*5Yv0Nb;E6a6} z5%4_FsQ*DASd_5_eKw23w}9tp@bSTIcdQUB;*(tv?;AL!-goN7!6iR0pcVOqe$4>! zdi*8cA+#IBzI$j%bz52=!+`{bF{n#munKqDkdpj5V9oC>T$FJry-M8A1PwNQmkv!$ zuU>9a+P%XT!P6}EeAFOPlRH^V-sy>ycn0leV+L&Rs|n`o`b7X0y-cDV&j}zR87rPbLtcx{KJkd QQaVY0QL4)bS1%j?2T~r$f&c&j diff --git a/prismatic/training/strategies/__pycache__/ddp.cpython-310.pyc b/prismatic/training/strategies/__pycache__/ddp.cpython-310.pyc deleted file mode 100644 index fc42075f4cd6506cd656537f1aae58db29bac118..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4838 zcmaJ_NpBp-74EKHW~OK1aFr!lQppY+jU^5Z$4d}Pgk@8fVMr8X%91gF(`t4VhfVgT z>K;-Y3=<#}5MLZ1$k7o(9(@V&LvqNuatsi__iUDJDeu)R94fX4=-TV`>sRl6@2g3w zRY!y02Y>qO#@|kB+TXD={xhNTHayvjUsM@MVwN_2_i{V66Z`D=16i!A{ttnmObw0hXw`Q(u z%wgq+8Y}aueWNwY$g0+;bpPQnmNwH}$9Xpqocf`cWt8#tAP({%iRpSGs7JG0csXzH z(jZDh9`QI2;x@gV2!DgdULI_7x)i{oV6BsLMwh(Yqbr{9!jOmbw(vk&+c z3I%nPNV*e*A-#3$<|Vqb5r8s528~_1!)cP{O8NIzmsXn2mtb{mxuHvQ#mhIKF5XJ< z`@9g^C&qL~)rXqHly#MP(cK1-vu&QceiCQ77w2x~Z*bNLxjfe&&Vrba2Hl+?-*DlJ zqD~4TW|S$tFz9kO+2&%$%YD?KKD5i~z_>o@IR2|a=WTehUqHb%@D;5^xUp%L^b%op zW^jv{+-4R$HZSnvNsSeFi4~c?USTEXJhb5C9ks3uYjE6V6;^#{w#sah)z}2IDoh7Y zn382M>`;~E+p-`!vC9G>%k3~(^FlYvc`7ZQCjN$W)B(6*l4Y`%#Nn>12D~*ei7j~E z5wUNB(!(FdMiCwZp6t(10;Rd6 z?dy9sw5$XBP}?i?AWBYad&Qo~td1sL>lK;ZBdqYb$%;LLm7sPWS+JrMPkzT-8TM6R zP4!U$*W`tRV$bdsSnUs_S2`rRmX{8kUI}(ioYt_Z7?{B3^5a%>c 
zz?(U}H!#&baO-fFf|uk2_rzK7n4E(p{XA0+IkME4SJSm!3_7n9=FRRK8NA3HdkutS zCxU3W1F-=P?uCMT44hneX$sL6#26FjeI4vN{1Is20ZhY_Q78hfr+re{(|dZ~^qZuo z?~?=LQ2RZ(4^Z%h)-YuOyxGfhA&u=Yl6I6RA9s$v00MzDUul`q7T^5>f+^$Sx|js< zkKbF^NFu%82y+dE#|Uf2%3g*!K{&O->L5V{Mg2(k#k!e3A(07O_A;7Y)=g+2^P z=Q3K#Poi}3CDa|g#?}a#dTKxIE{wTo^BF5OFNcZmh1p{BNP;XMSutDgTIj;_(s<tXr8!SH2R&>&-Op+WlVFXuh@%MViBBTVM&iV2opXRXntvc)^c`q>~7zzE|@Tr69AFp1L7R)5VN?=$`M|{#^4Ro&Ne!E z5K7Ze(%pt5p2KBjEGk%3u|T*G6Hv4!m9h}ABhPEij1(UmmbD}0Rjf~p^-};!b++OJ z3O)&ioOs6QIGD{rG2=K!jo42dvuG6c=k*uJlrf`sPyfJy$$H#J4VORgFs)=h!o%=a zfCd`lU3jv;fN%G-O-@%<-868|Dj#`e4o)&gDhi zhjVS6*{tvxv|R(>vGh<$JSZJ%3UqLd!!!Vy_j?4|oYMBpSk3L3(62IQi-=$5 zwI22+W)Q9{Xw_MHi)8lz-YN)h`D73GOsScwdgpXL1HI{<)id`|hgs}x&Mj%5*n2js z^=vkAS-Za-zYOD5R)@Fzk(I|}|MmxEpFnB*I?9bp`EzU%rF^cjsYiy=M?WBJ8pbLx zhI*WYdN$V&P918{nuC$kP@@#=KL_>mP@jbQ4AiHfo`U)`)ODzzhZ?1y=~+bikGL*8767N9?~b4qhk+v?8+d2H2EIgmgS=Vt z^9OFow|UsTu@ZVdF1-eX1PBBI>j z8{FxW0)jHQHURSP6cnqVgAc6=e!l~smANRqK?H7*mf}MFz(nNj2c~m51pcL9^EOIH z{GZdhjq&)yER{iSPRF2fw5xjxj)Hq)zRz1O`U!HHdB%NjSD@v)Z-J~M;>8k3NWkZ) z-w^`N4)R{sNz+8+^xlp#tgH6kql@&F^O!e6AuHoEZ4jz{;wMna;*eZ*U$_B!AYr5* z0)Ej|z$DB={S(P|Q%Jbo>3f(7@%@9ha(2G^F={xZF!ajeB7H9ah6iZPMkJK3q6L2D z(BZE)zeukuhC?r0TVDs)05;nI_W_nhS79%`Lc1>=U%$e|{E|BSXV(4r__{G(N$(~i zWFvu>jxQWVz*XPxL>(24z<6-fA(F5Dg9pCj)#_4jOtY5vkf^rGH z(0#G-jPRvC521?r(iQD9Khc2Ou;G!3fFM^%JqZ9Nt9viHA^#|76Rg8J? 
z)feFGgQS6b&Nhy3LW&N4JhJ7B^x7bS9A9_SdoVJg)Sbpzd4?n8a1~DoIAbIW+zpcZ zke*8us6f(B-<2?o+~Ji(o*4!DQL2Q!ZeOw_c_K?AToC8cvz?>ci!4Rn1I45>X)6Mf z$;kn~UPF&_Gw?V$8H&+uLb&Mi(g3~*@uN)S!euN-#A_g2Sonr&IEWaW;e8H)`BPTL z1<8t11in7$q?tf^D4jcA*x}cNNQ9hCxqzTmm^P-2vxA=7S4_d2>y0_-;A74WEE@$6 zICE3MeTBmP1dq0zeRhW7XEh3AOy(jgR6pWLO503M#hqv*r~kTE9mfX7KvI{<^4PaM z3y!4`^1TVK$Wkb5QY%;Vno-kh_MARrI{K7u6Nl8u6q(WMdPR41^3*X*9s4zX$}kP0 zo4WbbB<2&_FuyfT^BdDLzcy|2v3d&TS7y=tuURtxLmczpCOo8U{)<%Xe-@v1XTKAP zkEI7hS^S$oF`l!~$l;2#Nfc9290zhL{2hS2N99n0_t_#kRGf$iO!*_&BS{=f3%MeY zOFs-!dH1SZfD0yCzx*2Bch zrMZ?Qp?DdGe~!gnEROUZDKGHCi5xk@OGHsKbz;NuJtl-aAqM%DnB*JVB45MkV^XMm zHFu`=zS`56lLgmhiSN1sx5@(2;4oNI3P~HB7Q9*M)HnB}V(HB|ZlVL?{~ZF)upjy5 z{-mPbDpFe~!M)=Qpe2I;GMWQUuFzh^qfk*hJ-R71M~EcwBGkHgG&8(IHIMF4%{9P9 yh!6yU_O2+Q9v@?Y7FM74UAB_N%l#Y4B06A(Rx`C?p#m{bBv?W;KwUIujQ;`2Lx4U2 diff --git a/prismatic/training/strategies/__pycache__/fsdp.cpython-310.pyc b/prismatic/training/strategies/__pycache__/fsdp.cpython-310.pyc deleted file mode 100644 index 8745ac7754d221e010c020d896058a5d0efe7a65..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7828 zcmcIp+ix3JdY?HQlEdLu5+%#`v3wKlh<2RIUXpm@SavRO6xY()Sa#F#h-V~?JRH(z zhIT|y*tRyB1qyFB2+%&H0VJR=0gAp9=+piYi@p>%Z*8A~ZoADU-8M;^^!J@1CCX0I zzLdm#JeTjB`OfY4edot^v1n=dyzy7RT>HCcH0>WLGx{-*c^Oai83flF8rL}s^oEY# zRKQxPMoOnLBQRR&Mp~uQL8g^$WK}v7n5|qR*RmRx%CjKf$g4gItzx4zC@Z%rjfzTV zgR$0lW4txdm}pfSRaIsNldY-7RBK;jUu(KC-P+&SugY^MKhQX+(pGS&b+~a@rSrj& z*3rh%mff&bz7QO1%`|3Ix)>a9ooJj;=~8gAb*gbn*Sr(n=?8ja_En9SdF7$TE8eLG zsYZ>nMXfg0`=zxK@lL(7VOf{k!n2!!8$~wvR{YS9{dQ=tw1w^3Q7qipTivkzRwwXU zUKsn~s=d?}%{4o8WB;CKU+o6LhP}Av3hr@x-i=-Rx+~lu@B({DxabcpX7Bs)njLjJ zowkUnZwoK-=2od5t(xsH7VW@p-}8ihU$~tPHR`kjzv+9C?S|ZL`R_onBfO>`k$oL# zHruUEJA_fSy7e*EQd{^)m-)9~i17Hl-;8Uz%ywK6`)+`waovsAYD}7!VV%1ic+z~U zLxV+aN|r9KdCfbWwjai?W3ciFtOzz7w~0|=r0vjY?kK=IIlh~9l*yi`8FGTAV=>00 z(fcpA!xev3mfj@euMcgP6*XJvWN|RPvbfV^X`|!8)@e1ZgIT3X9nLB0(wDf~x`lyE 
zu6nV9xr<^qjGYJu@h(PnG%1cSwL66lEr4_16|HUuT@X&(L52ASv|knl7dBCWx27;vP#)Vje_-0;%tnUyS0ds&|G zOrG^}+{ACrvpjRZ#vyqgPr=LcBAybSGIT2V9rFr&9M1%vD*7w(Nwg|?<|>Zr3fr%t z{S*LbuETs)7_9o>XF{=gl*B1V}aONcHzX>PtaEjPmTSyJnV(E z`&APkr{}SaeRNQ*PMouI;vDQ%b(rNePFI)47CI}~?Yk;+^6@rYy}oq6E$(1kw8@;6 zP>!hyqtOiUMV2v1vFC6cn#AOc@Zzoro7IucM(H%36rSjBA!6-;wvP4o0oQ%~eeAhD zgG}{PkVfBtO!w1}nSKT`+s{IpeG@X*&p}#!3o_r&Ll*i4$YQ?;S?ZS{%l$HBrC)&@ z>yN#!|4i4l{x~1M!^Gv7t*82$V}xTUnLx=kl%!RO8E1I4KmLKnCqGO*(Em{HPsG`E z^A3yNegpe8DS>7#w)j4Lrynuo@?69IIQ+QG|VR!g=jSuiSWj-dTL@(v5kiaplHa7vvOqD(A^9QyA5i)=f9)dRK&K3sIuJ z%2$7Q?V7W=bZO~|GynSKC0R}S*}Km~o^&fom3z-~l+)R$RizPm!OH*7S`cQ6GY}u? zGUqtT_d1TKqvVTM&#kpv-Z|0r*Sv7`z8kKdyBE0UZeF`I2iWh#o;cSTV)wb>kvumz zM{t(s27X=UEH{?!;Wer--#x~W@wg^ja}KCbsBN1?DjTuYf1-4I7v z$ov4{2<~KB0id*$(N55gBJ2|EhXhy%k(wEpI{eTg=I~s`6P?lNu=jD;WA=bW7LJil z$0%l7IE3pu9s^H$JM|~`aYRwFmD$!dGo10%L#>|{Gbqc3IA-hSy!Jrf%=R-|xovG; zd;8Fj*`~Rf>t~Uk+_d`U7C-?<*Z7bb+Gc)R7gzhJ?OXdb)B!5C^%u3fPv1Qo7q*Jq znt0F$$Q;x*3w?uUx|&$-7XUPjn;#eenH0D1oBxnBihznI>!@83I;9V@q!X96$^f3M zulI}FOxJE{JY&H5JfN5~v-m{*X)XHMeZUfZe}sp0YL(vVA~v(_+PguD9k=ax&9iS0 z7U#B2ldnRrL6AnZyD%K75Kt`^t&NR1%58brf04bsL6M@{4dQxle$N&FFnA;G#70or zZ6Sg~ltF0W2EudsMiSy7obyAQnjdRod3QbxDGJdS@k9jqJ{I)t%q9??@cw;v7a^=V zEP_-4HmrVL8m&9tMp8=WNW1_ctry9mP?&hJg?vQ&E2Eiu=Ih@1Z?n7pV$g27L3FXc z+kw>UGQT^W(LxW7-RsQB)cNOmFFU7pH{~wyUSW>5_e*}bQt!mQOp@!R>d&q~bYhuq zwYeA6D$+tQtHQUYr=Ug}WR}7P@jMOVJ8Brat0E_NR>VOk2=^R}JCbFDw0jEDe(bd( zX`!izBZ~fHL3k|$ckt*d{yQ?a6X(f-A0add?>IeAv}HP`@E-VwwXBuIcgc(wNxVej zB8eZ6cp0KGp&|f8JaFYh2QJ6d@J31^nSzxvt@e<3g&JmC13OB)bE+_-Jd*DqV1Il~ z%a83b2fam4K~!^7DNE1jB`j|xZR%AZ6#ISlm5H>eAJXj<`+7WG`j{1GKJ6X)w&jH4WB92cLC8d9(GAF-YU|pT4!lVr#l&-vsn}RgBPgJBCaO{{WRTfS zBQ`15M|i^ZN2$#;G;*BX)5Wu~wUy_oZJcJ5Y3qgc;^4Q$jYlXAE!8$l{m_%NkISUh zPxYy%3d+Z}#_{i4LK0}>-P}|n&x2kPY{utMQdVj3yW~%fq`+&A- zhBOz)HciOmyoj0jI6l#*+{r58Z*Emv)fn5fHuEUcwoWN5J`5}_Y$60(KOOz$-M_%^ zEHB;Bqrd4Fwrac#EdE75zjX%4h}l5BHJV00XL;q%Ss&L1`dww4rTD3S3FUKq{1I$* z>hU?`>cDgJ0qk?mQ*QlfAb)1Fj1aHPCxGpLAKpayabCT{ey)q>`Q-m#t-WjQ&-KW- 
z`{mGHoJt>8>9Z<*LZx9LtT+k%oJyTir6*PTG)j~A%pzr~vf6w4Hoov+6b7GJ zz?D3&@WM$h5kN0>>loygyDKYnRiX>VMLLPSq6%(_SBSfz*BeLJG#7z7xbHXD<`io7 zN>rxQlKjh;-AFMSHxp9f6}Ne38H@t?$}_LI!HT_eX+LA%^IiLPJ#NpccfM4w*K4xYTOYz z9|c3AU}P@pE+@Dz&Z3(=j+2}y&BO$rr~p9`6ZVS!fH)t0p~l4PD3$vIz%?cyj6lS+ zFo}V?0oe@jM&Uu@BRxU=T}NsvP{!Tuuc>cxy9K>By9YxltcWo6on7ZSY(!MW2&xa* z67(k4&{OX~fHiZ)TyjZ0+iVkaF{kdfAJdq6>H4zoM(8DzOpLnutfEGJFik8FB0xr( zLpDUHhBMa&*Jk?$U0W7m1i4IGu~D>h?AyJG+rxXzfS$q)Mru7ygHz=o=);*q?X{R-Xn?M1Y;U$T3r_Ud2v z#N50h4j$7xy;pDEN5JndcVifEt1SY))8SKl6<-A_Kv#VGQnT4@bpu6YVJKi9==I;X z_Nre)n6`5U)CBBwaCwycqPDfYO@A1H3ZyAhT-WZoA=MG1MrY^xFrxfFY2TpZuc*nL zGx?YqqkX@k;9+maGmnjSK8ay%MW@-J3qzTCQ^mT{SZ=oiLBW?aI1Yi#F1tebU=j?lnFv~&o*yJUiDFC?cQQlj zDe$_AB0+G6dkjx7R^4Z0QFwQ|zVISNiAf{meoLBO$dkA^111<5g*t+QKSjSFfChQo z!a){dTvyUJZC@-MLpbBMA5si^FP8>G|GFvIqMl{9FY#VyzXr{7X1RkV&kdt6hrWy`Pxcb`V@upMgcjN2AzSgfjas%KhNY#nQ2xr%+!=&A@I*I zi;bNtqx^Opqle z$PyD|i3zgA1X*H&EHOcrm>^5c&sfp?)WE|^=5JWp`o#RYH}%cV@f#iXWg0QvbU{zR zT|IoRc9;CFUeUnd~~)qEI2jLL&oS?}N; zBlqgn*aC<=6=f{cCW)z|1$7*0Im3&lBMXjmx9bK;jUXTqcS)2;JV#=J#2-M&6nvJ* zQYx*$Mf9nPu5W_w<%+NpMDL0QiT6qTDTFKzD6*!9U@4)=lHKz?QlX2!n1UdhGx{S4 zQ)A_1cO=sI6$A0bKzuO}Ukt=o2E-Qw@x?%VF%VzaEWc$Sz8Huv2I31_2kXc_VT$-- zzpjk4e_<2spIMdticOaOiM@cm_r++h8T86juh!>Nv)WI_Gs>pg8JTq)-fqI~pTIOQ z2yOf`ruN}#3U?Ju%c&>(B4AN$YT`18 z&rP;LnNs8iWIk!7a8m)P+&>bAM)d)psvsqSOa-iyJA`Ll5ercrm=~#?B`0?4cK8fI zM@gRDtx>@~X0g6Ylh&8vRRL!BM+ARwib=BZ&q$D$Q%g6(JM5k%^Tj0izDVI^bPB@I b%q+kK-->QB`mO5I=|YC`1(i2b)zp6j^rXo~ diff --git a/prismatic/vla/datasets/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 4feabe19fe02051d1006b85c2a53d76c74f44fe6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 287 zcmd1j<>g`k0vJ2|b>A0&#ghOCzVxsl`6VqK&Gw`FU$X{SxyDcPi*s?+Iz&_AFv?My%4PnpSwR&8d|X=mERv912jU4R5> zjXMQr&)&z``#AUBbAIPshQ(q|!SAKN`{%XUFDuHADKq{vkU5Vh`m(AhOtBQEGR;>5 z&C*m#>%JZsmLb!Ip9<1eI>=a=z_iRDYh`75%FkJO@TUBNRgmwZRm3~(mx2jvLh@w% 
za!|1>ewKNcLfj>~k> ze<(O%oe1hyJveEd3{F|6f`_e#gGa1Kf=8`KgU77LBwxvYJUDHg4jNWN<|q7L3cg@{ zA$Y=iLRDPtMTM1F<&MHCx3$}<^(69DHi`VCTfMDY^T^lO6!KHx@2dqk7DpPJ~Z_dW(la!jC$q=f^oJ zcH(**)-QM6&TE%0JW*eYUGA*srKo4yl8$QW0gc&%Xw{X%C` z7zW>9^5-EH zd{(@IF?ZwVd!El+E-K%+>2?kj;q#L00}Uk8M2Q)5*R=zui}b{WCJAk?;f5)47hYId zy1Z;JEWIL54%1hzUcPd5>9W0i`RXedE-hIXB*)6Pt}Ke_A7s0(~FD)-DzAh@uufAd@oY(9ZzW(a+%OXE&lsd{?Te*B?$eSZx+1xeEaW-%^ zT-)nF@V@KZaG6a)#-O4Djm&vG(H#(1u~eL6?N!CqU4yA-6*q;qhIblo9q){5 z&M8P6XBC#RvThFPG}0L>FVm(>7i2ms(?yxiA)R+i?u1)zX>$rI?5A4>E3(oZ!>X_e zoQSfh(tdv4iJNPC3o6QvTtYh@Z4Oadg?z^H6CQ0ddvB|YN>}b1do|fOcB``@s$K3f zY*IUl!*25NMKRGsLpQk>#kOoibRh98I=_gQ4G(%3|ryK#?qni?cG z`W-_egXaZ2(SM_+uQGLxC{0?QQ#5U$4fI&qQBh`~ER~e+ssjThY2-3<%8s_H?C877 zAeDSMNcWW;gPGvU4BN`G+}mh#VD^>u6#x4e9JVpWrR-$-vTj!!WKo{!>%@s%uAghG z{p^e~$g?~v+)?`Z1?9FnD8y#JfOnCWqo4Xyg-!gK&dMKZtOCV`cUv2j`lSWs+l9eI zKeLnVPq0bMZGzS2lwGL&>+2}ZVbo>xluy!ZicN28>xDrDtygZVY-S_Hzt^v@*=H5d zIZ2N^tHi~m_YH;rpkL`1SS~Ko{4mN&XL3|Ks|>1fX?>z!=vUd%ZEY|~yfaFFlKA2> z^)RT##X+(2607c1Fb+t89b?D0HOLNfKweMsB!@%_=sEx3dH3_QW}~^Y?4bC$^Z&O4 z^DmFheKfM_4o1AI#`Lc3s@M&cK@F=@>l+x~R6NzMv0T5%usV6=Z5%v|c=~-cIxTCC zoK@nP{uJh1>udcqduTf~m#?Fw0K#XKyH|{nq}D>(B1( zxH;ML9C|*|pV>LmKeDT;;J*HBTwOofpJ6(yZ)^Q&c2bVx7~0unr;d@{ufh*F9GT?w6ipYnnk;`v5q~Wb%q6Bbo1Bb&PL5FTwHQ(v9 zd!!t^P89Q=)FdJk!ra~PoqN>Ea+B!jrz1h&<{F;p2uK_2@a^mXXBKib23lYHK);^8 zu3b~^DtsOc4P9vSPl-&_TlJwwgyF?*AZsM~XyMFS7`SJ6&s%dl?VA|S8FYOH*7nH@ z%;`eypTU?T$i!>Pn&Et-Gou8+fykYu#+$H)&pk&&Bh7O}**f!CeK~L2O~?0bd;Y8+ zA{9M{#OPa*TI&T)=c7lzxrQ}JWLb~a!k*9MD)4YM1R#J#qU93mxx1P$2snt;4ae`f zLT`6tq42Mud;Sc_%3W1d9mebk%jr3OqFAGQB*%N7+`oVS-ls(H+=n$*H!e0*{wV4i zDN*Qpw_M+r?M5Q&b-F!hVHSzB@2q0!`DyTpOftwwq@O9OW}cQ!?uxlQaRk@CZvF6SpvEr-F2lzND`b>Hm>lcqwdUF683CT4`6Af^Q( zv&5MO)?g4xrnwe+O*axovheGn*AXe~U9u3d+lfNX>-1HP$SL~NAPdAFmdOow6LSTQ zb307vO(*7$puBNR7?JC@_+#{u6hW!k*jL15;R22b&!gPbR(C&9z=uD{m=G2l_Qge7rwXB<} z@u``bR!yy@7Enun1+9vwgvUg#rkB7)yan(YSyKnMT1MTpkyER(uBw^3p_bA1G@hJl zyj4Z206s(8I{G;Wc)3xfA(P&-ZIQEWsKg$ipe+iveZA-SNewU4+$L;*y-pZQfk(E@ 
z^C;ydB7{nKfk?6kSHsYkdr)q;Ep+S*PTG$%BKy*lDH0(+4nj{vdpf6>n5l`m zjQ$?eB}WsL2Ywag&f__NHM43~%j#^F8LrLAturki0|*JL?7fGe%C zM2oa!s^17~T0B$x6kgej;VD8me zzoGEym;fEX<-qI!{H+6MwzRf7Fqz~5$0JN1WIGx&2H6lb#_H)y7LJOw;O zHcKnC_4>F+EL~og*Bu`AI_v?CuXKz7-|IKMcx~iBpBdo_TIl+v(19Nr&9@WD|HnBi z<@n*60~-$GkpPFB@3!KhMagq=#*<7#<1s$tFM?cLf&HrZXbrYDfX8g#@Z6h`m6Jux z?LwUuWO`_ltfI_~(1>+%s6~g>c85Tb?`_G`YsYY(`IG2JWO^Ns02AEo{7ny5=ce3M z*4zP=ZjV=$q7AQW_d3#uy3CqwhcL_Bm3dOPIOI<3wm0vQ-he*|?J9S&+^*G5FR;4~ zhmwVDCLj?tvSJFKtnY9Bp?4{!Tby*$p5HYppQK|04|e2*l6mCUX+EXM2@oL|`J@(( z6=c%fFPJ{mQ@qAqhecLJmQ&gu7IBUS`LBamwY|d?keN(;-}XQKBmI7I4pU4;mJfOM`k&wty zBIIcDDv@sznIf_bBGSDs1n-K1j2jHsKx9UV(r8r@dn>A=d_+bg0{n~`MgD8l+@Wr} zR7UHU$UH*IDnKhrltI9c-$W5G8I<0r(PR-sW*B|IE+h9DLTY*KG0QTt%M{$Q?eC$P z$O4(p=U`C_dQMNnN@|*+YGZ%7pKDplr)8P(v6iNs@u_AcTx6$F`%`18pjK4_IkbYO z@X?XVR&7k}11l#TAMoTg41lblRL3PGVuUn3DCHljzpFwqY0Gi~K(ceFgkF#``#q!& zO^m;XvLE4*aJ{5%&5rjyzDfstkWv3+MI$oV-+`K3N8CXAb|!ogL<#y(psy+LyweQM z%3XE2q4VDYuc$bYjYHhswBd+g{{e*3-Wd&3!b)k+CE1fdcy0T~)FW9Y?X8kpQjPnc zk(OjTFk14S=#kP&BSB-mrtlAtgou6{ue)j@qC?$gY=VBa{V!mV5=yIElj9N^)&uae zTc24Wl9KF`#!D-7Tm7zzwb{CSD8yK2A>yEbMs<&L4;`>Nf`d;+@uu(AZ5u$JTsGVO zT0L^9faGS|b@CXYZ%)GHm|PCyrP0SOTZBmYNT7d&?hmcI$d9%9@1xn7>ekG`+3od= zwe!KYak?*h?d_}w2VAIKf^)$RMWLjaE?CCrLC4_kFCQ8yYb6&1ipdN(`#LVL{$@Sub-N*t>q%Vq$)UcizX^TnMsLaiCKL6V^HF#XTTlIed*+gy zH<;x{_K-dkQ2H!v!av>DT2i}__Tbb}()pp?`h8UV6ps{ysTgp{kK{R(ent)DPuj&L90Koxvkj$k&m8J4bFDoHICb@_ znf)9~eW-b=M_-#{+@uYUchFM>St3>M%a)EyeM z&^ld_!aZ9UiA)QwN&NJegb@PI^4|e>Bb6wR2YflUI&y1}0m{&+AT^TC5YhtnN}W=* z`?>pDQ{&Rx2U~OwM?Xr_BZWnqfF%&iD?_kJZ-fdD_e=H-*SpDSL>#4ds3!?54k&hV zu<6Xpj(&m!#l>T7UB_eWq{wk)+OE#@nt~F!8i zVHYpV%q)asMOVS$T6{%6pwkpKdmCaTTt;5>NJFl2~PQdl1*) zzLV2m-PW;lCYSI12y^CiDB5^{OT+X0ukhx739>cI!g_=_b-V@_u}+vC7ri|nk+>&_ z=a-563K9G_fWm*3$XAK%2ZA{nOKIp}dq~Jh#B>p<#*_@m@|UUfZ$J>$jglbWHz;S2 zdn8glz?!e{za^%VMCA6nOeqli?pyK?J5w(2k>FJ*LgVqd?RUkwC-<4MY(U>!X+-5R$ev zduTWA13fs~)WJT!qjU4XSRI|L{{zPT5G-jL(lI0e<6y)00bE)FxZFo(tzdBCY(6$ 
z0yR^lW=?+2H$Zj>#L1|qnA_*0;mFb0a$uYcuXt#`9``Zx!+^}*C36y0ojJ$^_Z7}^ zCysc9##1FSNks0xDN0QfnIS@V1=i7n4eSFtc*D|tlxWR5@i}bsxYr-^{G}T}0A89u zxw2!*QEpn&%m6J(xcE725GA5hY$ye@pw1#-n*?pk(vByg+tUb8msQOG?UmoC&HOKc CM03Xg diff --git a/prismatic/vla/datasets/rlds/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index a671810126924724fe7210caa9c535b2172844c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 252 zcmYjLI|>3Z5KXd(B5b9FSJ*;x0TIPY5gWn6CImC=29w<-iHb+E@hsNcT6qO4C!*lM zy!pIg-mB8IOE8wVdojoQh{Jy{9HyAQkD!R6hFH3zmhBiNifu@iJSkEcYX(*xK>1H0|$!iz+TbxDwZjeT3J77nKF>j=F#{Kg%-bCnNcG uH5<;9tWk&8PJ1hZ-hY_yJI|d_o(uG%gAk+o(sahQQbz-m=+#HLK6wMWmPEz? diff --git a/prismatic/vla/datasets/rlds/__pycache__/dataset.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/dataset.cpython-310.pyc deleted file mode 100644 index 8e27b6f4e180bb1b309e5baef10cf4062a2377bf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21911 zcmd6PTW}j!nqGIKaUlqTS5c%Tds>t)giJw}?YVJed1OhpHMS|QBzwm2x$cI{E06@$N1l@z{HO|zA*q;;xeHZ!ekCELnXa;>4tP%B@_w}vajtwN>1?~Ue2 zrO0hZD`WgUUYX$U$;uSI)6GMz>B=ABb6hqqm`qr`z!ai z9;iIPb+XMbwXBNOI#xNx&$;GY>v-jO>%q!{{5;frsP%B=;npLSM_Tih`Bte?YMrQ@ z&^2e;dGwZES#TycPVSZ0G&_ICKnuejZfFh7F5r&;;r*$N$K=%>d2iS*I;ZVXe8!x| z?QtjXWSuA0bmxh>F{{mL_5?~Not$$TZ%)r@RBBHRJRfp~BlxPMfAFW?(%vQMJE93MT^kJ@GX z6vqA9kYQg^y+4MUS*+f)Puq{bovwVHU>eTiiTBLdES|)ga}1#zt(zhCWt0s&PdZQ1 z6Tb_e!a8619Ce~u7VWRvU&AaHd3V45o?$Oy6|{%n!1oz^?kBj^^Y&BrS=9RmuSVj4 zgyPfhW$bTaKXQP;IjnDpV1s+=;~7Bi8z`f9XVmw3d=n%u;Qd8)|E&6!V@=!7+0Vb7 zt(>uqt6J$>J6H2|&8xYNSKi#t=g)V9V>Q~IBkHwP$8uLiW7D(h9bv6@+V#d-S2Wsd zR;_KLQp0Q1nvESQxbotKtCs4=E$4qqM7>nf{p@BQuUdO&s|!oUb^rdzi{a} z)V^4~v~=O(cl`0CS6-^hnpdmmpMPcP1%Ei`lm^OQy>|I!S+`{P#mL;*E+RkRXA9&(*%V4alrU`wbos0Ic={-Yc8ezF@U7ATJxN$w=SHT?fTPe zj#pKHth!Xu^%|?Lf3%H}2-;4ywo0w5*n{e(=xmBc$DiE9S6eo$wi-9Ru5kRECmg5R 
zs%`qI=U#s0+WT5**e{~Xwp|l8byRQrS$Dl#uLJDGMys|4Kx1Looo&}I+Rmo89($&_ z#NHR0wf0(#_tWJ8x_pT)7F~|v;vZAvDu`8+<9pkij+mp$ zB!R?(ba{v_598uzgad%KR}DPS-v~a3@X6r=Ii^kB`PPkhv|ZCqb#*batL@?b zj;?FBwcGk_<96z{c{_bOb30ql*`}R-JJ%oe^bKP--P7&NyGDP^OZD=edCL$Ny!1w9 zBfFRDbFbJN?G^7B{RwZhH{qGRv5m1?x@YcYZ|S=kp7pvnzBjR( zxudsrd$^uLjY+%kZmOU6ro2NN)4lvHqo3~;xbE+JhkJ!vI>sYuGcv1f%-F?Q?RrZ5 zo;Pcc%C|oRry8t*vhg3KZ|T3Pdx!R>dxbke9}{P^o+19op2X-={mEbjy~&;_SJ0p8 zO@*uAd+$E)NH5Y!HtLPl1V^*#qSLZmuOoo)%=|#59yMrq#4I~j4R1rjvUyNq4K=T9 zz1DD=wu>5yMyyWVB7WhlqQ8dYmaPjN$FT0#wwH@pH%deay9*h~@aI6k+lgR{( zm3zKax0YkWfsU7HQ9$h*4coC7XjWT|X46{1WH&dP4ac@HN2j*B9*w+R9srBG-sv`N z3oEm{bthJl#~fHCK@F>0u(sA4=v7VJ<(9-htT}KJN8I3f&oRj6Bn8fut&43>Y!B2> zqtlFeJR2f{5x+*)_A zZ>lip0H`S>D*=IHCz8t??3-w`7HpG*?p=o4-!DT&k-sD#;s7y$%p9O*HL)LrCZ|u@ z4Irv3i9-p!Rfu$AC=3fs7a$_tqm7TE5X%JyfhX34sBOo|i9rJ5WIQB#lC+^Zo}(_A zuMxo{D4#H@Y@G`_R!C|&O^4{nn%{9mrxbE4iOL{Yuovc6JT4OfYKv_J0V2Bu9@LsH z(ShWiHEWJ^tvO4MP24iBtPJoDD}YDlLK?gSXK$2?*TBb!uzA>vlnim;qIpBi_Jw zi_nZ|m*-05u+u;@ig&zO^T7NiOQZJyj$jD{n9@Y(kM$^Pex=iCVmjYmcQ6YLdpj=;r~f}iX%s6^e?8$dbI_t3A65$A0Zh)b8R zL1xpE<8{@xFAzczM{LWzkqfY7R{^Nh2D)Ik?d?dQLFSf@g^P{}0tixTKCn_=f1S|h zjRnRce8rY=F485ubrOPLvXSr&q>o&gaJ_IShVeSV?;cV)tS#9`ppaYhFG-p4I*UDg-!NiMlzznhstRlIRa>H|4?tG~aiU_uizCouo!m2J4z%tr0 zrAh%I8N)vQiQZk05NFD-C=5*EQ=!Ptr>?ORF{)xnj2nB*^v4jB1KnQ;(Y?6C z)E5{^0r74_HRMRvT&s3-E-@aFSs9FX+hWZ_VJ&tH9t;Rc=v^cju{}gXL1}35$(b^n zv^!g*M>w?>ge)1C^G9qqN(Q=7bOx9zPcA~ur*cA4! 
zwF!R=#H_@|dNU^c5AxQ;I1woYNh$cuwq-l@8q{*I3K*Mmdm%2B-U((6jk)H!u$b8X zTuF);@{N%?s_angb)&V}5ip}AzlFx#wwCK%@?`}sq%Fv_&Q{fh_sTkJJ+{24MkT`& z2ucHb08~2xJJ!fnoNd?~OoR@;m5N1bn$&_A@ne<9eP?-C3*^ft>2RkHX~T5wMdhWq*bt;VcXbDWPRamI#N5Ji8O1FF09k37Ljx!-pC#l^3n|`JHd(|3)vBH z7bH-gz^G5f3ldy{y@2;bxrBiD%%Kze68RT`F#{-vrZVhdtJPRr_t+P+MRf zt-&1*o~~dNAv;vS#BdGo5#WDws|In50aiVzv5yhsK3%pxAK`@K+7ZOTX-7y+^EtI;)JOrY5Q=YM_@1^db zY(|57+x$_A-P`6~dRMp8v)UcF_-D0uP|HSu@*M;v`*6AMW$tJfv^VbmWutHOQ@s?% z)5P_DETAwEyutmXboc{Gh21QAG3_Zh`|*u_MEEk`7W>CKJ=dXj*f5&?ZV%9 zqq`=;6b#Lcv^~v%hW~-F4$C(fVeC7 z8-=*;DFG&TNkpk-bK1_$O>uUY=49exe0N&gIm^mnKpUj)RH?}c+Nnpa$HfXE$x`8`H!{0U9IZxY50e$sZ zjmqvAzhS&dQJ&P&Piffqbw6#B6;?8R!>jvg(uQ0=TSsKad*X55T!ns9GQ=s=^;1ya zu~}K=w4gvK1X=tmzXhtUY@zkedM&_fPr~iIH4|^3~ zL63wfwVMr0RI&R?F5uWZBQFN57SpwMhFMMIHCj7Iq@ss7Ppb>P4aTBZFJq6fZDoHd z;L)%bLjGMc<<6L-aq@Qgg#crSoBs4u{OwpT0!87rfV2n{$UyZb<Q%i? zl|pD3&SaNnTAW7JN>;hC{1k5d6p%n%K>7Q*%E*} zkI`U{(1jz-7Vb(Zah3|GU-2|u7i_3%ZdpKQMCnzb%fKsT}Ia7R8j2GVqkwGau!$(SUl-yMzMn0nunFYHl_7Kvem{v9C8IJ91#Go{uM|`?Kj*>* z<>0$X#)NnlHFu6j$m3pbYz~CjCH{!#P*XgQi*HhiDA8>Sh6e~T*K)iXA!TJE(0v(0akfJ;*)`#i1ZrymHqT{&(fE<;dX!_ z6zKI0yXogHyeQSr%23EK6(fK~D-b!x3>)ow$IqgF1cg0+2%%9IF|$>NK__OZj?v!q z)2y_KYj_qf(dAooAucBl<5D^#(T3p7>%DiXf;d6-=IC;qF3DsB@j8_`xcGX#QXmI; z;0h3LP-#|Cwd7?Ls?A6_eK2wmLc3He_njQD>pr9STs}1c+&cQH@!gyO%mTjV4C51^ zr%umPy794@&iuSEluO;mcsRy*IAIo2Q$_(eY^ZJia~NtG^5Exv78R4goqooq?>fvrppdp-l9v7E`7TE1G@YQUA{|~ z@8J?kL6RnvMM4TE=@@PXLadh+$VKdF8^{No)ovh%gp^BCBzxMP!8s#PB>UQ~W@n%h z9i!6Q8dTCOWs~q5sF`^4Wjn{c451fIe3|Q!>Y5Mgp(F)mD2?0;qmP^mZgrVUGhWuq z^^99aU2j0){IULhUDeWsjZ6;e-wtaIQAP;Y9FEnb-ZSVwkGzevJ&eo@IgT#A*UNAZ z-%~w6Wkg8mYH+c?L`3kX4XS}pQfg&9}UfdZKUo@9~ZCyMTDTTMjy zn`P^2SEf6tBrSFh1(6}nBYOW?t3lpVMZ(JgW&YHgCxPz(SgP5@!U@8nj)bz;t8H30d|NVTNY zt8MmvBr;GIkTT(t2cyxx(P`eG^|p{|f-iD!u68!J>7Mvb%u!ECX zF%B#v2An~56fmlTLjZ%`!LT-F3pXLV_c@R~=-O>}20e?wT?ct31LMO2h(*i>UMgNG zJ^P8E%DF_24E#YhC_!bhaD)OLTKXnz99%H_)MoANRzbAW`}fF0suMPfr3GNIP8?1Z|G9fT#Zjo3)ytQIO@d>axo@7Mv*Z+68(7t6PX1Jdx 
zCA{L1_c{Tdh)IP0@yL(;xn0=eJajd-7I)4jN$sg?0(lFRDiFx8&`M@mKS)xcCWsE< zNYaqdt3mmJCP(Q4vzD@JQ`b{>$VFu?@#u0MY@Vg0!2@Fo`fgr0c6Sxg zvi3F{BKl2f1CVb@+5ovT&01^4uAMDoNS6$h8ExkzX$adCwp5BlERj9tEmAF$J~Enl z=*c0oSk9cUDLMZ;@RnWIdG^Ke6d4gA0Wf&Sg5jpIt0$xlu}FFi|p?j_*cg>_l;V8D`BOiWSL z5f(^fuuyJ|KNL!g(ug!b$awzZR*1AwS;5^AqL~!C4Zhs)5>j0!sBsMUy8YwNYs# zaSMVKWMw=Yq?is&(wZj*Lg!f$j0VmIlGg#3Vuh1iDlrOd`A1j;AkQY`4{$$7rE_u* zF!8{$ukzetbXg@-BuqE+c>q`Z26buT$P6FiP)t0upW%+8;~sSJ$E80uRwU8kAJats z30-~@myoK0sK^isX&2l&E;eEzp1!7gaL^KiA~O~D88%VPcd#dWh6iV@CymS$y#1MX zQ@4y>n$1mJoJN~W9mXpnDhABg+}nkIhFbM9st)byQ0;dxJa6o|&u<9Ew~?#{N?nW$ zGSWH&BarjJ2Mir)`tg`-tB_#>lYzlS_Y7b+u=J9a3x@go8X(+lLUhNJx zY*4vBF9FA}9+^x?Oo$Q*UvReXZ6EBP=E0&_+Anq!2_5AK_~igExtnDs4ZE|-3H7X4 zD60WGCS5I@*+QAB$V=)ZJ zA=5&%8g0~A+1{7!XAfC>j}NMbaKM%-m{NPXrF z3Hbo+>VccFkG+2JEeTRlWBVHA^>F`*dCN_U(BUrX@WDwUkrFIL%;#2xeFLWl)7(fr zN)m(Qbnq9*Y*uK=o!x`9CS?}K#qf=2vbG1OCP$MYLAcbQcrds=G_nh5!D<<#WDEDc zc?0vp!*9`x-=+&|W}$lKPfC#qEh!R`-w$Nu7{y%lohM|SK~Wm1xCiusG55;yA`HS&Ew;!htur8do+ z3@OVpGZc;}>J-3$E}BD#B5fMt5AhOEK15begffA~G(CVML3uG?|0)#lM>LV2;1cRf zw0P3X2sh5-!=XPM7B|{(U@Pl8;D)30KQ@ukzME5_Na*{vXsIthRmHx#~#9`iMDH#)v70g95}V zW5krPF|vXfCH+zI!Bs&v)s?P|zDpVLKD}hB2+>Va${f95f{5sLlB0a`6jxw9{Di1s z0+lnPu*6`O6+W3UXY_okm=b@2VvG^??B`b~s4s1W(Ultf7Vsx*Ig}QO!|Aky7`X(8 z*2lS=PVj{1*e8?tQl-2H#N%XmF6eJko$SG4;fWVKs`%%$tbak5Kc&mR#3d9i2n->2 z2Sx!tDSX^ExaY3s4oSufrm9aNy~;xwPBw$&Zs|x#IsyX)nI!atw4jts&Vo^5b~X1n z$Ozx!5!B%`$!kb%GH}YdKis3_tP}l$XY5e~OYe`^ncfH@YG->RdoXF}Y`2%Tr$E{x zAk%Em2uZuxAMNG$GQCk7zs*4PaPoQ>=cW;nLn>F^xezfk?V@e^7QTqtrhdcYVj)~ULmNvs~ypLW3$@c2&JUq^mws{zHoMX z0(V1vH|dS;joF3X#2so$_anW@@V-cMUeo$h-Z(_}L~rs|3b45F%c*{$f2cnVI1~Vf z|J*A`IJ6Nf>>pOJDEwH5cstaa4ps~kar8Y?F}<07(VOVa>`mT5Izvpsmp z_U^l*p=@UF2tA=6TH6Hg)lt=BI=)+|3&XXm|3DWf(CYpk_LbW|U{BH=g!N4IOTC%i zcn>Gy!DiOdVt3FqqP)qP1hrjrK1hnhxM`m?oWIAh8<@PX@;vB{QTL zQ!K4p;z7*n8-S(U?*k;%@;xNpa{1tGZ6s5SKlesPQ2X2hR84Q7z8i~lMgnQk;(G=j zpF@bBda*iP52ss5ZQ#f^iLrPHXh4=NQm!&N?KxmB(Z;cqU}E%b(SdUrJBNFKHYy4t 
z$lV|dG>%%)@`+&7k%{YM5_|aN0@j0g+GIFVe-d{SgCf#jYr>(3Q~#9OgO=*IAJ9N_ zx@$PYi-s{~lc1Xb#o6SI&Zd)gc8?x;33QqQzaD|nLryi{X>-gU2N;O(So2CAK>=0c z3@)IsNx!quz#4bP7;$s&O9K_wY7A zG10CY0vYZqnOGP9231(6hUI?>_kOBj-+W&ee-@Op-6_6_N9tSrLwxTnu+kkP%0Tmu zd<#)F@y_9E@~58YY^ZfX;?bh@kh^o}3dzoW#WUw|l29JQ08Itxi~}r4>1DIvVa!4P zS`k?Um^c^ED+C+pe&>;M6yhV2WO$QR6=WzOtt!Cfa+#C#>% zuI9KEu8s6(N+qdGD*jiE1$!#v#2?&vk^HeMG(zI5YJsF^swMs%t%V&YYy|#$dSutg z%-xXkzmPfjHtLxV74ah^U1+aUuK&XS^TrOeb6LV9>M^QpXEFbb#><#rZ7 z&Smp3C5?|WX*gKu4sMpbUi{F=X4uGt!vz`3xsQ-5`r%lnfFNNeRn+t7jZDuxKABYB z5dW1XT*jsJl%K6u5f7+V#ebl}H|cU67k`MOX*huD%1~(r`M^Nt$kQaq-6Aq{$7a)nJ*C&Pmo(q#-6|46cJfX`(*vmwT*8X4=AWA~_5?jqIQ zjY7r#!e>@4GnUGBM2no_%=yVosWO{6SLLHs*nd?;x!cnYIr{AW_AJFAL(uJs3a*8yk3rSjM>Lyxc_k%=%$d3^@GjeBI9lP6f z&H`gK_f=d?EtkmwO$)k(WMsO-z40LYlVkc(JvUS|jiDg@7~hPP8KlQ&ex{I=(wILQ zOB){=8RLUu*7$ibXZ&4$$oSh#-uN41*!b&W!T78Ei1C;CqVX5TsPX5>&i`3qJpaG7 zUol>b^$cXEf9UxyKop+>)jvky`y)g3Y@p{1dd6(H=L~wzpl1U;8|c{_&HuSkM+6*g zGe-UsV^a0@^Psm0^frgy<}gp{&CKQh&Nzz^KSWI)k5)qCnVI|th6QqCPp6vKUuRG& UFVnWURtEiK4w#i$DE`&|0ols#MF0Q* diff --git a/prismatic/vla/datasets/rlds/__pycache__/obs_transforms.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/obs_transforms.cpython-310.pyc deleted file mode 100644 index 5bab2b0b6a0414a2950a26a19503c692baf30ac0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3726 zcmbtW&2J<}6|buPn0C*H?OA8D-Yika$^sL-V?vM!V6E7&aEOqR*i8-=t5JKZW~S}_ z+NvIVJsNXh7kMlZBtYU4t>i@n$rt_sB>sXNa)iEeTCTYOB3a(6ZjZ-n2q0$E)m>k& z-tWCvzxR@Q-6imQ`*$A={y0y_pRqIht3l^m@TfmP!^tM$l(UdVY?DRCrV*K&CdDx$ zv^H)1u53E+Hp98d-E?8j5{%oz*{<@6Fa?8uO|TwYCirj%z8yZt-ACqTU9YXeDnm3x z1LyVXX>Yo#_Ft?iB-eQZB+ql>4r$Gge(xq*s*_1SR=q?=fW_Kj1kbVZMi%lyt z;#WK#bnzTt?yq`&%spQU&rj1ZfK&X;1Ia?(=t(~ko>rypJ_QGmmXVvcVQX&P9j2kk zt#@LOveBY>B~1GLAnxY|oRgbM2%b0mVLuXa*4f$fWnbmC6xmS5T?0jxKNH?_@ZgyG zEi~xhM`U8&CJ*U>!Ho&oC35MI^uVdydgP1^oH-&H85sEnIc|qj&gkTJPeKcHiJ3=#f>N;alwgr9(7zhL*-9dcsiwieZSYQu(&6Pmj!^JS*? 
zf^2ZYa@PKGZtRHt6Cy9cg1>)%ZIDD_O%8*Bi2Hkf++W)b{k3<$ck9M2?x&fMYpD!W z1hLlDGZ;)tWNJ-@T&-OQ(tK_mX1ht0-bDEyBZd_+x?1|Ty&fiAKU6o{(`9-6)c1+x zLSC={x?PKE!bgA>u+Rb%@=J|a!_dtMlZ&t;QD*@nqp&y%3N{^yzB&zWRcoarUg8+&-5f39xN)m5CZgH_S?B^`&W(3Qlr{K*K3|U_pFX5NrdmHDf11s2FKoli1V(^cn<#Z`pXmt9 zxjF`H_X3Rxx%vZ*>5`VYG{yA76jNnfnPU1AXoZ+6C8iIdkC?~|rpqO!rp8o>UoJ6K zG^Qp8O#DT#RRwSrlAD0*FM#Vxi3>(&aAD^8tHyO{T$zw5F3<~bm3jiMUyXsyTg#)j zZi{Zh#c4v&iB(6^G(%vjg$y2hNvhLt818#TlqnpK{(Y*DSSQIUHh3rH0yfhtDD{nd z0mk0I@ypPRUgcVw4xCv=^1tBe3&ebyn)G?VPj-0~>L7CCvycAj>CqR2p0d5Xf=kY{ zM#=_GZ%s!vjF+JMH5|g5JG%Z^Fs7vbzaELCT_`d=^6S$hKfjTbUeR>m?fJ?}WwIM^ zsE(j;E8N)`?XJp7126LTy{-?XoF}AABm_#& z+fRmG6g-&jzz~U{0(`E9D3S&i5-s)=Uvd?ir$|9ieOWx)PsMjzEMF84QuMeV19f7I zDcOcKqwDX)YM7>p%&?Bcg2mH&z5#%SaAZNWqgs&&^^^yKP2|JTDj4JfC0;a3v1D}r zKbz(Jt@!_q^QBXRv{+lBb6l*HIF!8fDzLGb~;X+M4*>Z~PsBepoU(BwX)nZAJA~6}}Rv`r-Y(Wm diff --git a/prismatic/vla/datasets/rlds/__pycache__/traj_transforms.cpython-310.pyc b/prismatic/vla/datasets/rlds/__pycache__/traj_transforms.cpython-310.pyc deleted file mode 100644 index 3462dcc2b240ceb91733eb6ad46cf4b0da6e0818..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3408 zcma)8&5zs073a6O`q(&|jqObhce+6mt66K20=X2MMGXf%6opa1K@SQHdc+w?Tv4Pt zLwUVh?agVNqWvG%M_>B4^j3h^o_x-&K%4&FP|{Ly;+6o1E{4LA+CwzUtgn3d*oHV`e*e&+QZZ#XaJN=Do` z8P%5pC9#ynE-6`Fl)MIRn$v%_Evv9D?FG`cXT3lg_pA%I`Z_@}@#QVtg>%#ThU6UF z?!sVO{tunXCU7&JC;5L2v&{&q@Zm z0_gGt;RXeZ<&76H?k*$ee$%!qJF&ZprXf8FhmKs`X4edSB<vYHIn{qa1T0AD7lcxHm-11;4PPEgc9bdeiCjgluaW3{aJnnuBXO?;nC3huLs=^4d^#yj9ee&4Ys+rPksPd`t zY{qqA94BnYbEQx*^&om|zY1@k!H@sTF=K7RhmYBh!vD>VueWx?-vi_M(ph7za`C_f z^7sw5R+(C>t7H-6jEiW>^~8k9g#aHRcboY^evLI`B8fpfnw^d%_D#WVeb1O4=krvs z0d^EHNQ6{S2MdNz?j$`hS~6rw0m2aw0ofRbI4i(4x{xztW+1_zARHp3f-9tDXg;nO z2-HI`EQ2a81){KCeTVB6y*-F&TwV0;;eH4Aw`~l7Jtzgh2XDW$AG~x*rx=#Oh*Lx_ z?b3b}{QYn5-MuIHtdQz( zrV>3xCB$Zh7+uRkAF5R7LyBRuN0i}F%BKUtk9*`HO3Z3o=Rd9NBEAUt?H#3g%u{`g zmrAy5)nMHZS6O^5@&M{6a=nA0iX0=Tk;?v|rCONox~hYR+Qv}&Xig)k&{-{P!;Tyd 
z(X~*;~JI#uW8Goli?|n4~E_ldSs8fRnN_<+Df^Y93Q_F^+e)6g3S+ zsa8Iz2v;7L%?c(FI$FQ8kZ6`kS?{1g?X0UV02 zv8=T3q3j-syo=%dC+m%Py(qek3dpx0SOx&H$fspPkLSZ-Dh;mG&q&ce27un+d;-El&snPZ(ZH0k-;c>(QqdPGU|m z3v4>RKg1cvqW6Wjhtwbm?!-|&JiSZw^wkQTekLwXJKmA?hj*|V;B14_#R-n7e|1*# zc$zv+QIoh3|HP0f)0dNr$y?k#OBmEPsa%kgEEyk@!{OK z*}5dEH62qcw{0Qmof2p#nR-P zlO^TjNreI0ErOCkdg9N6bd?9$C>T&zb@1dmy4&Qi%!>R9itEk8<9)tbrpdT(Wk9E; z(QoLc<>k!kZRj0(7n6KvZjZ_iehN{y7KQ6|jZHPkrV=kkiuG-Fnkm`QAkjJPX{3JI lb^b2$!WrcS-6#NbqXWQM_d3qbZL4~N7tDlf~ zh8La5_H_UMcmDI=&pBPO2L|#f_`G@a_3F~2snqY;L-aEU54XXe^J0H0g;OYn(^xUn zH3cbYcHd=o)iNlPP8~|&49-50!dap`-i5O8oWtGl+|8eJMhf@fUYv(--%=O<6?gr3 zprVlM*=~|M+k*#}vd^aRP$F_T0l5N@BQcpP@hUE}HzIR2Ueh6RZO6!~@YOK)I&RT= zxVwhkRZ{rcm(pNSboLG0qK&P#Cn8lqZo=2KBJuS&&n?PFEV=>T$o<&jPmuXOK6_sT zc@xL=w#rOIj^YAfH@713&3H>}_O0BaF^(J$^_ZwI%cHTJo zK+EkhZ|;crJw)Q9sPE*n?`pLNWbTg1e1AmGJ(1aO;mC+ZkjZ-^*!J_^nlyi+ru!0+ z!k^Koe*kc|3E$vxJRN%J_LxQcBNADJ4sb8b$l8clgbzlLO?%?8JcREEJi~v32*oq* zFqgPI>PHfy{=r0KQY?>lsO6m<;%1X(hTJ$7^W^a)WMUNW>KJ(fkSDRIpU~$p`>ELM zr;}s?^6nV&Lx4OZGNKuuBt2hFLl)n|G0QZ-2;Up_<9^wXKb(lXPa;`%2FUx9bj3ln zM{`a~L?*@kfsT=S5)wRBjCrcmF%l;sLyMw4o+KiZEGolFRFQG2Vnf~~A|9r>Dg^rPnxPi~%kF;8kABwD*AH@%ojFC3d_>rX??1Uc0=ka6s zaX#V+{4xA-ekVpmbMHyKi)+!c7m$*pxKH4x@F(%IeTc+G>-ban)A%!&!A0dgjXxXC z`%HM3TlSiO)xRSj^4r5>vjYwHW)KOw(R4tcvQm)GN6jC42HJe2ki2A(~xuv^mmn1dY+_6gaf8C%93` zqD!cyo9c-}xATz<m^nOS{y7GYQRuO$yjA<3pe_>?HIgx z=AALAYd|XGFR<4Kg|y#&*m7qquLidx=LoemIaAp%!A~WIihk1Ya2xzNDqKhkrQv0R zg43uAD=35Eg#yl?9L}O{oP$#D_6J%bbKw$vE-)Dv6n23=kh+k8%WR>?@6j|%uMti2 z^O{z(v1hXTeocGO(@n|A%pRMa@q5Em@_X1Vi?yHSvGE7_k9PavJEvz)PapkSijIJM zetG-oc-5|vaq1aWVpSF(v&ZL6ef;jD(_5#pUI%-}q1c@ow44&>$R6X2QxiMm_97W? 
zb}h>E-UNRRymCqPq%uk-&AvTp&bhqjt^xSvJqKRP`GeCu+(DJm91Nk|?9#(7y>OW= zbW`?Di1x7yQ^D_sn$o}@ey{KWyB!H{LznnDhC8vd=@@ahk_lP`mo4z;Fxi1oU=pLF zRb>jA8hS5!E;am{*X-IfPgY{-kg4JbICO7TG_KiAsJTP zbH|gFLir*!Fk1^2iQg^MnMA$JYkY2hAW)h=qP_r5jY@!@@CPMT<`C+2^^&dCO}&Bg zv{K}vj1uZ&_fj6}7q&97z#0%wl3D0Ny~1=t%Cl%d+y^6j&j!-{KA~~ocS7fgNg7&7 zFG4|4+jF6BgEn?u<3KdPk=u8ON57BSMcWmYq8|R>(K*yxrncp3plzXNR(?>yYIqv9 zZK~O@1cO}7@hB@Hc1y}-0?wW%u(CTSJ8#&gjs{?jwcL4wEHF`Mh%|_1SQ@M{m=pTX zvYsktcTkue+*k*#DhAf*o(tlNnn!n_JRi-C@w41#ewGhF8P|4gG-yJldu(xcnGgIf zW<)nyB@_hZU8-A-$u}CpGBi}u&3VvEFP1b``)Ga1HoaQW(7}%eWi!`3rwaZcx{3O@ z2ZjHjEJvYMq?}tp4-2*|SYt`3)jZQRptb2HFCEzCXwXpR33Un9f^CJ_fD+-e`h6k; zxM>joKEg|gH+3`!4>mQxn?iB4g$08`=w!yV6^*uqnpGQ^>YWXm&^-szM=N^R>1@Py zrWCXd3#@0k-`S|uB87E~xvk(XmN96xaK|z-D-0+Jku$TiXso7NhBIfFojEy(9+!=J z+)|ylDYU)p!8U;xjY*$3A zjwJYfk@7*i`S`j~n$uW$f%{9OBQa=7#@W#gu;4w8F{t5nm>n4?uXBaWxxPAB4VgT? z*f@7?BzVvl4F@!b-PP>5RRIxTuuh2O0_}t@FxP;tN~gjOR2IFvU0tDDBjb2aVBL_V zP2Cf1pndJ^lEW8uTGN*Kx81ycj_36hWb6t85eXX*TNfd7C!=zUfVE)f(^eIbl;evk zh2gweYZh_kJd>yp_RHFQy{#ZW!^EKOU_EJ$r1kbq3iXPdgYjr$PKUFNX4w{m3zF6` zV99Cc5i}OIp_=Z@L787+vVsV>rHmT<5(A(;^N84fxU34>V+Grvi1fF1N zrq|j#4sB;62-Yplsp^!aCajYbwm5C=-mE*GSdO6CUtxx(kJxHlvhBIHvV$$i6l}uz zp>i^~nL=>Lg;SKFo3Naj1Xe^xUuY1ji3O~r*TTh}I|S`8^cq|2SXX0-?!l%3>W^L_ zO8uVUl{CDqY!DNpLC=-_q~^I~5v>oivJ4qsYAHO3#8wswdIQ^VuIBT?vcdKf3kDpJ z%eIZt5WFp@fY$S{(SSqIBrMlVKeoJ-Jw`%(i~#3*0=*NBg!5Ib8m!n57}$>nI|)WB z*|Y(B5Yvq8Fc{s@OPT}dET*=B(Q3X#8_*HO`ZTGTr0mAXQ-VAk*%fdK+aKGoJF@)^ z!4eHCBb<2n{sVRiHBG$b@P|rO&zjT38B8l%g&f@g{-7J#MP(Ou@SxYh#n0>Bq8x|% z%C%z9tx>;F(lIar7e$|@(173x4JXk^6sn1pFiM+}w3RKxBr%KaVi8&gOHSC)pu7Na z*T}snY>cZe%F^o8go`AN?E-~~r85fzw9tdr$VsFs;T)?Zy<$ga9+A|^iKaH!O4@elO3;WM z&v0QQ5(-+cn|4*7g!(fvG&-BQjy5#`8Bl|^zpj;OKzrC0@HR=F1AVKmoq?Eue_2E| zIB|&K8zlrzWC4IhY1@R9Q5g2SH8^IJ;B`QTz`F{{ULK%HcF^Ew^)WcbkI9m9#@K#+ z432JNaMCfIF%b%WPucqhemdvpSvtfXSF;QI{}XhWT?W`?4ZEyhm$h*5R~|bvqaBz& zHGOjCl(zra?7_o_P9~k6{hqzzy~RHEio{F4DcFMOWPT!fLY0_XlFC-6LQUILD>O!w!3*5WU~Jp;6bri6&r 
zPo*9KURM$i#=XnAgK~x(jQe-$JkbBi>FZi&2Kpa4eN&A78%`f>&eF62M*WI&x5Y;N zoYVI#r{{jc>C?;U%4?i{M~wbGPCpi-e~Z)aj?ur)>G#FxU*&Whqc3s#T#Wu4r_&hy zX-;p%=ua{_L<4HLS=G$n;G;{D*OTl+K)fXpD=&-qt3+%_kpDZ$yRkXCnI&9mnzV3cYFYkTt zO;9Wv3jE&x)8F_0wV){f5QE9z3=D2S3o%j@qL>OnM0Js;nko{s=4zg9>XO#ol$SQs zUdGILSu^Y9%$%H0xrUcF^O8=x1+Qopy^>k-E|?d*DRau3HmAKAbHn}#++ zGxQ=&chzO3TOzY$?vs@H+8JpfIZv}fmR=Hkpy9=ua;Lh0{}O17wJwlF(E1&+)KbdJ z$A2(9JEF|7UFVoq`bUPbVRxc{9pR2^hav6;3_HFPNe0L7;jKRPuinFTZ#N*0NBsyB zJF-I>;r&CK^+Ig>1P9R`Ww>uM+oPa8d=K|oaNrOcFVhG8fJLBu;CCV?@TD1}<5HX9 zPT)r@aBR+}dt#Zdr}3=AG*8O|vRF+8=G#`9i|d z-uj^4ux{3GZ8ogU+C5&_yan8KYqL?S-{aGbyLYUZvt_OS>~7-&o*yX{hKyEwYdhvG zt9*9rUfrry+tpURZLM!LHa2gyc&RF{q3(C!PW>L7;F4f#wX2Old*_M<&skTkpw zs0(;ej$GUC4eTD}X-1;~^E(;w3dt`A|I^SkXyIQV$RZ^|LvX`~=p-}D5VWjB>Mu2D zIunr$fP5W)Um`CJ`l{)iy_FCMSoqCPlA1c3>G*L8aMN*u;z8TmcufcoPA@3O!z8^$bcoW_^zOnNj4t)0r*XtYB_ErmjUsBC$K)k=T*=UPN>*nU2 zYU6J8HjcH0I2`l^Oi*%pnMR*sip@17He0{Y@0!ZL{P&ev$CBUna1 z5fD1GS!n;f1f8P{m5=kkP*2c_Iz*o;@F?(qsQzBPueOw@3VQ>Xo@!jJyu;BxM+Z+4 zgOnD}f4#l37kG4q4V*pd_YNT>R}Nfz<-^<6t5st60aaHZWJ5r2rz2Mq5C~V8OTv}l z0bMzZ?A%x*fR&xV>tBCY>``nstt_GAw>*0v&i#N|BOr-=pt9x$9or4BS575gdSv$9 z(bna;*!RS+Jf3?~P;Wu^V?n$P9WO})8sE2B3L|jDu@~`VJbDj4H^L>$@xzD>q|*U_ zC1@ocP0IMhW(2aC)%8uo9z3uTk5;)Z0WvEA60oscVyf89l<20RGc$>YnrT5aubDIF z{gWFX+U|hX84DQKT22w3l(?C7hm#d!bErzx`MHUrcg6S{3s5WU)6SVxAOb57Z5>+p1$2?}Q0XbE zdXhQG4pB|{7=!ocPK;qrLei@ddaM!zBcrPVXsVAi0c7_9EFWvboCLM(V?ADvS9-C& z{7GS$1%2%-z_$|F7I`$!?Flj5(6_uq;CAEHBLm>(9p5U1s|-GtCKF#JxzGI^;{%_FKL z8=pl>xHc>i6KWxen)u$;^Hoy?w_!QXvy>$Iq_Hb~%C3&=#NZs3Wp#B(o!%alPaWIG zyA;Z-Fa%L>XxfPYWupM^Nat{TKnWDWCaMN zm<^>AKwtBAt+mqLsI{)bQ2|!~l}BkI#8)N~LwQpDr01++QKR9Yi?58_TtwsBIJCXK z3pT9ySde%`n%Mz3swDDZMN~tL>YaMcy0dw&eYaU}?M&K{Rm)_Fvx?Vc!INM$+=yU# z4~LY-nnznm?iC8(9hva#1KV-!U6;m^m2*}kX2DrRW|K5j$MKaw1_T=0gg~_nG#E`1 zM)TIq^NNy0l&DHZ9jM75~zj8yERJ10Ny6W4Q(o!P9mhK39kYGWH{2 z1<02ed>!bQ0-|+E}BN}H( z3F;EhU+>ZA1v%J)Rp*n~}o;=qN)0>JnJQjwuf0hAYj(xH9h?<@UzDPK}wN7-Mi z&$Ala0MSMn^NeMYpkrBF7hhB|2g2tBd{nv4E(5HC+yw;@Tp|-C0!xWb6ktqz2k_Z( 
z*-;r^JIjid#nTliRVol$;KzOxGEs&x@!ifux?ly-6-6g92)h8C`Gd2vNqmMCQOr~X z79^26k*weRHj#+gekg)7j5Bi9l8YT->ZH9dL*{VSlJtE&3Hw5I-7)z$x~DfQp5{M@cfM=M)glA>%Q67)z&2Qu`qt%vE&qtQFvJKNrw z=|OjoNcv*F)Hj?N)4U?d0olEi=kV+;gId@%d~Z1@r&Fr-v9XU3)@ zC<}PVss5(-tLm=ms_vFsw>BMkp1$+f{*6Z*=Og_1_S1lm^YH4gK-_Sc({WgZxnU)8 zJ8o3%RHIs_R&j*Gs;m}NUahbZR%7+R1+*n_SmSl<8wJ#~XbjLp7HtJIZqYVC+b!C` zc3K;|*o19&vq{@du|2k(W_xWr!}i&BKRaOCgUqw-A$Hid53?h-JsMQN_9L%X*bjmV zI|gqH-sA8-`s?}|mCguvd7W2z4dj-dJMBz6D0zcDcE({Rt~ssQmwwxf#F|MJ*R#ob zv$@23BIeqop5lSfMy-3Y=P_!i=Ef_i9*vo|%0nLhWfCOkf|;w?vA$wb6s z<1KBd=~-`?`u)<10P|W)Ur@urj4=Hs^?ISC#yhNOl!PJdRbU=g8{xv!O_3uADr|R%C^o|Q6M>GATSA7kurN_t zps9iWE=kp?f?6}u$PEXY-~a*t;sI^#Q)4i^dI$whSdF9DfnpbmNfdif>_xE;M9Wp% z5MwYP3KIo}{^2q`%G-wNZrJAij)maRn<%_?qD=reU~Fj3g>#+8S=T z*&IzZM1XWtriWA9ZD*7%#_MVBpN;$Q$}s9|L}Q@`A`FL8`}kL{;qwPv|HWRaVu9sB zx;*y#Lm5yN`HIEiQ=e4=HVw5IM?V^BFe z!6B=m4gi9VDeS=Qz*Zb|%#>);{)gcr|L4e|X z>T5;@!uSyD;j+R`rl}+W8M5YQBU&Y8uqyf^E{~`54k>HOcP2j-oBk5{M+MipBm_M2C*_m=7UyQknaDYT+-zl~U_hGe%sKOO z~dlI794cQZ%8E_d=fltVd1 z!w#B|6<9%tI<7QXJhj=_!JL_-rr_ano}Alcv95j!^UL<d;<{s8c!Ies7T yY>wXpJfGvg0De5j?*cxZ<39sFo8z|uU&!%WfS28S^(xHpo?H9obzV94yZ-_d)%BeSJzgD0 zc~KVRgfy0ojX5zKJ};Z1Bu3t0R@Y=;l*K48CNnU`#5ge4W?)Q+HNcq4z?c+kfw3+F zV@j+8#&ib8v{(;}^%)p5VgoQ{GB9SvMqq5nz}O@<17kJ=V~f}djExx>+r)NYY|6l> zh#kP#oPjYXZUM%Y42+#(7cjPFVB9Kh1ID%tjNM`$7~3;2_K4eoQOUsAEA|0nM+U}% z*bj`k42%QfATVypz*rQAfUz?JP!2n!ewXJB;2bHI2c149WL7{@X& z9MJ>D@eGXf;sP*EWMEtrtH5|P1LKl-3ostbz_=`~0OMo^##_br0^{*!PP|R7c_A;} zE++xMkKyxjEuP=O@SP05UrwRqU2+}Z56Ef2cgyvF?~yZrKPWc<{*asne6QRH_``A& z;E%}7fbWxAUdUsdAC+72_I|ky@CCUY@W*5Y@Wmwj0Y4${0sN%A7w{M5eSn{mM*x3G-VgZ8 z@&P^4ugC}S{%QFT;NOx*0Y4)j2K-fqzsB&_SMB#L9{&(vy=X1HTqv{=d z*-~wLp}SfxACt??jb7($wo~tjx)RMxb*E`{Dq#l| zbW@S_#!AJ);Y|wah?~C><)t$zQ6niSmvQD^mWp{HyF6^zW)a^=3)&=$C zV8jX!YoqBrclT7!>F@7Z+mYR8EB8O}K;_ysG|QB5hms&Y2#4W3%JY69hm{c;^1Ely6XgFmJepqn%aPB7p zsWC(?SX_6LS3%t2^g1AJST}FFvrA2^O!Jbgol(s$*rn!upzY2yJLjcx66G6|Y%SK8 z>_o*3D(XwJRa;U`f$#S_B2GOk8;+$`-J-%rs7?b69Q;?nQwe_`!C$+Kn#;MXxof#& zx$n9CO8#( 
zx#{H3qNNx@t0J}Fr{Rq`Cx5k|-%6-6sYhW_q=XjcHDkFTMnw6&-1Z2y8wJ)F_b`4W z=L}uWJ)ghgmrsN}h&AuzcuTjK4BCb`BQzF`q;b{~4rG1m(x-UUk9NDQCPc-tDp(Yf z36k!Inb4ItE>^U>R-V}x%4?7)xfoQ>nLF`z}dkRlJqmTd}Rm1rmt1bzWAQ&JG^O9jEoKwWU+1KEax!5rK4z zUgI0Y0jCtjx9->lOSMGpJoAV7ZaePq za?7eahZfZcdbnOt6vVp)Aajew>p8U(B`-go>37;WvzK9nmcQ%iOgar`z1O1Fe)eJ*=IV>+|hS6B`28J=k z5Hn1knE={fK=1eWgnp=|6yOjudHobZQc?Z@MB9_HFhyY|dZKmFD# z=U|e2%jn9czGsb6-qW5Ff%XIm??hFMgeD4n654-?m;VtUrDNv> zj>%fh@Qavi_99){(`YrjwdG!iePOt@@Z$HApGuM-YmQ~LYKo!{DO|Jq#L0)}^X_EB zYLlByo=(kK?aFzB*Z#&Z#&d&3_4q~Cv4#s-V!qVJ60Xs-+_geBEFo)t{}{8Zi&XuB zs!;dC7)A2_2?-S<{{t0>ej=-3VXr{ zwWpOV@s1W~zh5VJ)#oa&rtB+3F$#`@(gkOYv(S6$lg5Udi?P1!V-JqGe<6fZ_Vs;m+YY}Jo!9|T-M^7!VstDC?pd02Ry3`$K zW?WkJmQwA*ApHxt*{L_ITDMhSO|IgP1D`bm8LY{?S(q{4SFsVBEMd)#p+2@|QRQlX z2DRVfnuTVEC|cw-BhwYG(MYP2CY*8*+UU1y6jsGm^$_V2e4ED&G)55E@TyJdRof&x zq9&J@*;lW{jGE_GJcK!V>+vu>k%O=Llmt5+;qsXMa$2S!!>+ZO=cGDJ?fX2A&1VNM z&_{u9lP^OHG+Xf0_hde%zE7Y!wn)S|;N8bi7b8oIci+flfqn4ZEYc_lx~~tIBFV7!(mkh( zoGEZ5aI}|weJo*1m?R;E1`@|qWDh*6PmAaIt7$y5iM35nolDZ>OK4~lPi`_iTW|Qb zo~ZH!Dh5(T9iTxE5;#O4ZREMmYOf|NOVuclOnnw=1QhHF98P%Y17|uzG<~m?BQRa8=E4J zSz`jRY6|5~p)xV+z{`3OwTW4C5=gt}%!{li@pKhB6I##X%b3L@cUDuyV5*Qt_4t#X zkm5<`N^Ek!77OQ&dKQjN7%RsctPN?>)k!bwc%g{|B&Q+WA^s<#Fw^)yqBY$@*Xs7E z&Qu1xHBb)qo`bbms`bb!qz?<=2k{VEjL=40 zBk|MflAnE{^PMUkxx#LkB%%?!p>vZYz-P^~LtsHxqHy-{ezgeqe&ia(zOe}S>hLw6 zvSlp~-A*h`;$|8}V+{GQEqUYXMzQ?WvC+x=q5;kNW>m%&mKZwFG2elg zaAC=Z)*1w^fPU&@=5NUXVE1UCjWZ&oPGjB;-JZSTB)RuNnmkE}nI}^};~O<{Q=dg~ zV%kAyaUQi1)f!IL2ttF%w8fClzj(Z!Qp1WjEHC;YLnK3BYVWAzxtZ>0WNYTSvEXqY zY0n!Zc@dd^lwQ-wvIlCngMBSg<9MvyYayL&3FL_rZ1RLpsafPlW{fQXt;~`V=s!Uz z3iny^zZ(xxNrtnj&PD#7Gdlaqxs-3Etv3&5)RQLB~lHhpi_#7lKpB&}QeG@&c=e=F7yR#Inrf z&9BgFlU`X(#Fq$3em3BtU&Ljo7?qH(>~X1rRe_;(Wldg zR7m%A`#+Py;K0}WeZ1hTZLXDT=Z#!G*EX)?^Et;$%xcaG#3!@`@tPrDCqe*yua_}k zU%j44+*0un>d@{;peb}JzglpXoqiW^$1+Y6G-;Q+wgfK`v6Oe%CEOMLFfA@I z_Y`8xP`{cZGsZTgh`c>f7<$Uj4&=_8q^f=H#KJ_fte#H1ZhEF=!4o!O=2Qi@yU`Bh zNM7u=EX9OMe)eba**A$N*&biY`Z)U`)cPM6;hV@uqSVu0Y$q{+h?R|dUfyOPF@ZaJ 
z)@pVf?P?GoyxQcp^%^$ZttOrC=_IH01zHL6t&x}`LA!#-1jPfT{Snl@#!0aP4<;V4 zy>1P?p{-usrCMJ^T??(-9#G<%f+TYvDfcjD?8JM2Pc7k^k&gxs!DC+wQH~XR)U%dp|r+_N+;&@1^11M&Q*4ZmMQ4#m)FL46MG7x_BNS!J&aE zAe4yzMGD-fX`kpNYa9gffp9_PxKOoG%X23&x)$HlK&YoUcx7;H=xc}nnC2XhJ zP0_2j*rC0&N!QSsnHmnEP=Z(^1g)I9IwPGLZ>V{mTxbKBxx>tpb{*JZU?m+o5&MX0 zto#`4!mB&8)TmubF7IE0$~rtHt-L!Lp1vcE?xYOKKe;IYzFnFD{g8~8K>yI&&~kl! zB;sj!n;R)|z2MC^ZRUF%>)a}KlGI|5`d#*p#wDy2e2Lsm(tRY?tR3Hlmjv(mG*ItE zt@?ffI|-zXelQ%J(<6@6t&tPd?X?EjoDx1l{Q&j%Zh*w#STJt^=qqSb=z9Lt;Nf>8 z*GRL!SXb>@mP%VgQ^#NADbx?8jq6XE<0XIL5lKcOX>G%NBYfkl4HYUE1() zO5m61@uD@s78TluNHDscf=dKCcJnA;$A4H3HWr%&rBBOfx~WMjIDqIDb>l&%DKy-47EfLc89V zH6FOqx1Ouxk{(ll6!6m+)y7>fZ6KfX;r>+U} z38CWYQsgq6Eb2tilnE|X^m#VsKyVAN~fyp z&fp{h4vgSl2=+NSX^rF5$B%m}#WD=9KXp+Rt(;OM8 zpUs+_`Y9UqrwI%kkj+-ZvQkbA^e~qC8EXA=nWJ1Jl-0~pZfUl0=>i9ZYZq`5+-cNP zPIJD6hGR$M!~MgdCOi`M8+b|Prm%AH>{H-)=$auCq&^%oa!8C?V@-8tD$Hz#`xSm> zDy$3cGjJFkmIwX9;ou{P7}?+JKp;cPCL+z0hESIY93>F?8V!04l>GH!^&Ax_0ycqg zbJC&L9s%0b@>ZLdd>3Bboy~fiHpJOK#f~H0cS&&8+dyPKoiUZ)iL^6C-P@vDL?1y< z2{H1cZu=N&Lk$sng3-<4Ao>e8gbUHFl9+yYt;Tp;ui0zV#G&P;#N~>_tlmen;$(WB z_C~QY%C`fM3HxZ~e2Ab}w77TLZ`_Pb$Hb3)L)RowA+*h{cP5I$%d+?mN^~pC7-muO1tE(LK^p?Szy#66^*!u|NV?hm!YPJ$9;7p4|UOOohf9x|rJcQ*O&L`3+Rz@K+0Qn$s zibW_yj+9TKXzWgU7p@n0J-A$fl0u+wVQO_RcnWMx-*eEj-rwZ|NPd5Pv~x?r#T0hc zh>F$1D$dG>q-618%{!2Y>(6vjJew~Q@ha$%&Zt!KI0g?RH1V=oiXYc8N7OF@DZzXu z3YnCPc!-ly%MbgkAXiF)1+t@0n^a45q=G`o>J0e*teHs-p zdeEpr?AO6doaxf#myY5Aj}AG;)VFR+*T{eYdeReTqnTlST}Y*ooE(v$T5=j)zUu7` z`&T1EX{2a4yrXiVDV!DcDU>ND;nVb#$|7oI@jwcRwh)Onp_M12K5=49{25dy2<2Nk zwVQ^VC$N`5+TgQ}cS}8Cj21Au+E2}X6(BZ7j5)8OR$K7tw1>aFuQu9V=~CyDqKw3sbSuwUpa z6`WRz9zqyazYe6>GLVX)zdZ7L@bD^`?G5#owUTneY=u67M8q@{B?jEGe^2+KbVvPX z+}U*}^oyQF)ucP6)lt4#O6IWHsFwMKGZgMLjsZ2CdTa5(;^Kk358lPI$Gvyldc)CP z|7MOm5?s>34q)o-6u0Ex$#CZrcO>kdqO0-beS5c!n&&0m3w4M1K7l(O^cgwS?wWLv z+ntD?3wAd*tX8kR)U4xnOjB{Z&qro)>9FPAA0&PUofAb~jeZCN88&&54yWK}5LR(( zu|{|F-M#&%Dl@SU1%t)U1!gunkaj38v&G>-r?oO$EhLXfW#S(mboM5*)qSsW0w%NZ 
z!PFBbnN98v&s5;&Z~D$vWVT#PKcMj1+QiQmWVSJBgWSDSZ%U@@7H77;KhiFJpEtAl zUD3TzPJ@M1%4~Qr+A!Ee&1}N*P8)&ld?3BF9hccCM5X<-Ow<4Zb%x^(wqm+#IN{? zY$)57#LxfO^$V-`IUzP0P`qvT4xMb}%nHsp^}6&EN0f5S?077i$HbkISn|%@9B`7C zvx=T@aM!0Maojb2F)pQ33d9|y0v+MIQ$dduvMO8`??fyZl{N57Mzs}~fru5Na=YHB z*}xuRU)CLs#DOTI01Q{ly-36zVsG1Bul-(Jq@|hs9mb%dFN_T^HH|;L`0&H4zEc|bE@oQ1KLU$09PwVnCRup?mmmWM2 z_OMl#9mXyXc3|xuel5igtJi2>s}60+mbd5?xSQDlI#|QckO^#sS<10%iwAJ2u5H=w zrg&u&%y!7H+o9`hFn7213`h)e5&3I}aXpTkZ`UYv5lSNv(GMNaeat`M4%cwGvr((L zMf^+w+!aMWmxA*Jk1ISX%R3V8!Qj_^7VtYj3)5in~zZQfm(HmD(Il6dMkmq6L_A$I|#gs0G&1B zgFk%CM!i6 zoVrw>peM>Ban6O49f~4+MbRsVmx_WScF8@LdD~OZ9Iu^z`sk@+M_+&JWcBFNC!ac1 zd;0Z9Pn|yT)a#!-9SH|^Qo~&UZs`ba8R10p1LVZpp9C~>lWR-Hq%oVHE6r`2+cGyk zw-*0S%}viu&aDGn&rj1RAD!++|b+x{4LH+0B4fRhv!U`Oaq#8WB9ko?_>D4 zi1JC4P0g(V{wT25fJ3FLTWJOLN?~X-*h_VXiU$r#Wf-4|A>g-_0rGznSZd|7uPf|HWKy z{AY8<_)q2rKQR}K zKQ#9n|JXcW{DFDU_($fV@ej>I#eZNPHvYbOhw&xzPUH8@yNthQ-fjF{^B&{xnD-jL zXWnQ0u6e}x9rJ$Ux6KEPFPaY-UoejvzhypbeBOM-_?&sn_)YV;`B~#p!$3?H{?Tt^ GAO9{yOU0*x|(iKST#VuLb*aPF(>Zg-C}ASBA_ zTlf9%d-uF^Z#f(sENFPX`l~>e7dF z1HbF|{e1u;(l+!5tf@WJZ)yknYSD;|s8WqH?xydGc(fwIx-G)UiM%kd>rN!Roddm; zLlu<{G(rA(dTXv0*4;U=>DAnzy6ptjxh>zByY;ne7q4-r5xHWnAv}o@URkxI_R2Zo zb2%68xO1D4=gT=RLXX=@pKa{Mg91bBD72M1ePlF)7|;dT{@k4j6RjcxQ8WZWBp&J6 zgMFFD1+_Ufb5R#L8bX^%L_duhRE@Mg*)d3sN(@8$?-v(CG*`|Gwp<}yLA&T(s9!Zk zYMGPmWk!0Jp&ik7$a``PqN(uCjk}EF?=GttliI%C)PZ)hQ=`h6PMNWNwr`N!+YqjFg9IYhbj1@gvZCz} zUACp=1)Z+R1jSi3|20vSOG=~F(6rRsv7lPl+-Oq-a>-ic)|Ok20QufE!bmd*jTHCntQPOst&sz(nuI*aYn!aoSYN=IN_<}_hEi4w>z)@@1UI3E9s?jc>?BAZ>s zPk~YpYQ#9zW(2W{pgXUSBPK~ZLru9j-?FDPdfbLfxwT~HrT$dGiENcJ7-a*|d=2q$ z%@&Kj7(UyY)`qVM$;ssE5I=#HB{A1TA+KA;gfXD!fom~^sxDL$!U@^5K{*c*X%BRe z3)?WNK<9lR_qO@E_PSnS+~DS&e5z+ie@MYCyZ4>ssyhJ>>;A4Kx1EMmSRx*AcMv~3 z+32%_M$wP7G{ediCmpl9!Tt6(sV@zEfZ4&dOEHpu$sL+dB{Ba+~&Hx6HQ_ z^JkZVEXS8CP?B-~!5sabc8ocK{L`^Qd>TbEKukjvO@*G=1hw%vOSXjdIJX&ils6n1 zswPppn0&6mU%-TtPH5f~7x0P;IQtECmv%9?qUWw)NI+;uso$YMB!!%b@+L#YD!?Ei 
zGsLRiHe_XK8&eSqRLHYwuiT_Ip;ZjZ)b1l^;e}-l^rrEC%senj>yS1G5@p|Q*Dts0 z(VJpk(rc*QAF~3QtvP9i6Ep_E9DZ6-!0gJgItf_9h=j%#Rk}MA45iB{?#&Rv%pAjw z2^q%y$tas6ON&R!jsW|5PGGk#<*IlAhbLYnF$xh6SB2MTxWWdvdBnyY`iQg1WO0AE zjvQl4nHGrN1(Wcn4G}(CG?du5a-Nuy2%M_lnzJgxR*}ul0nb`s(jhRCqm^ncR z>_Ql!ky5{h^bDl5_CKOWaq;uvBZz9n0*TL%_#BB#B-$}WT&Ch@ zNi351JOrp=iu7)87Mb#9sV@lt@K~=uClY_E_AZAym`yp9nWMV-%rsc)y21x6{~U)d zV@#JryC~y_XvN|F|0ttF0v*!$FbNE%ZX6PGbYSh{?!A`B9(uiA`Nf~N-g`3{_tuqfJh@8a^6#%bQp!)>zx%FIYRj8{QOc9C-#k{zTYuVl@2a5e zL|x}q6%VRQKc0W|#_r22Dlk<4R(t2(Z~yUlrI=0P{z=4r`go(1E^-4Iiny@nics3V zciW8%!sVM~*LLWC1o(!j#fGyk!3--{rPQbR%Kqm3LR-CZr8Jz>uD-JNb$i~vcKy{} zB#+Pnlt&^i*~!WO9Meh4YRycRsG&e|u`tWT4OAA#T@~wt=sJlh|H$eUKpf}xCtf=xE zLqxKlH0W&hQ2cU$iCHGPIJI|$nhVO75d1O=O9dNnC01so;UKQ`a6+7J(Zh! Kaq5NfiGKslgR9a2 diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/__init__.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index a2d7fdc958d8290f4f4418ba55ae44782c3ea1a5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 172 zcmd1j<>g`k0FJ$a?jM)CBttwvi=2T@$=Cj^(b9-gGs_-d3b5pJCU^$*;CeN`v&(*WcVk5VV z$}B6SB?cR1VPSiqLH`q9vKn59L#ao%m)9hLH+%2VYh#kgPqI86hEj@!C zOYA6U%`@$iQas*y$zo3ARNIm3g{9W2Wu0NnjoiR@Jo}A{=Pn%yTfFAh-I`7HY^sOP zJS(}{V&k|5PPwUKta`Zy0qy@$lPsPoJYflmt8|sCQ}1YN>YCP7Bc-cF>P<~d zMA{wXZ)(0Sf7S16*D-YuKrpu4(DlQ}@oN}1qhXuGX0_?G@Kq}vi`C^=z41U5Bj`Yk zA^GHMM;C)8KPuYpBKI3Bj^8+X!*h;){o_c%jj!y1bFe;MR*cwWO3mXWy1n%c$0U$rrpTCxaN=PpH!|}U89$~jvVG%Ef+I_7!lhj8AlRZ$>_Lf@K|r~ z2sLQ%akfcCOvv%j;)-cXXzr6lj%*iRcjF0(mZ6&JxN51hs-qfo%471T*}QWMYOl{Trf6q`*RIV7R+q(8iutJHms z_FeDKK*q#FWuYMR0n@?s4ur*2 z<+gTP2iM~Rt_NHUcv#l%W;cy_LblFg^pk^@*I%XaMg=z2)jm+}Yf|*4J8$$fE)VT! 
zk=ss2qvaV_u5_v1+%W{u;7P!@a?MySMPAIGYgZOa%)e z7dNCXhhCG^9%zE}ZJbNrW25PW%SgH7LsdSFZ;vQ=QfaiAe@5ccRqiOu*hpe0UZk9Y zBz=w5>I0Q@u(MQM=BtDK9-AjUr@6qKQ?C#aqGl?cQcpa2HDR}kF;`QrhLmR7^bt+a zHy z`=~Ipvd9I;Fw}QpLv$L(ag>Y0lt?Wm71&F7J%vZA=4X|n7H42GYKzc+POA!osW)9p zoOfgW`my6kUV7i5U*j4L*ewYw- zR++|%(J#|z8kmlp!SoGMmoy=n^2f9!;v|xfNymQtDgKJOoDo68j7Mvp+o~?CR&S7^ z5~opy=`j}F#$v2{!Adeoo7sIGm870B6iq#-YO3+@|2+k7f#&0t0%Af=0i+-u zpjK|6Q2q&$z=VIbr=Cgvw@fwlBjt2V>Fj#bU-p9)-|lTxxt(mnMs{&jkS30^<=5Z% z%9)Fal_blSNK-!Wd8+=6g&^=^jd^4`9Y2IsX@>D+$UW?4Y~N}RF2va{zW!$U+@;t` zeyd-5=_OGF$)X{iBg#?~k04i>fOv*gScd*ilEWzG#6_xjhLW#PG7#Yx=~W66$#J~9 z-|{hgki;@1p|CVd8^;o6v51yBrkW3p>^Mru{EW%8F!VM1aCRiC&Zy?6hB2mz-=uzW z$qo-UNiNx+93E+Z>d17lWXlFpZCTMI5~PJBNd_W;W!X^g($WIAz+M=zH0o{TwpxeH zV#Y0VO-E^_i*(53!!m6^3|JOQ0G8;U0eR0mAD~Oi#^2e%2l$ObyCK|Gi;GnIhIU;9 zO}iHOk?XgEHZ*f!FSxz~tt)pxXwM&$D*AY_WJyX_;61$C@&Y%4Ms^~*ju9uhW)Maa zIY7BbZj&EQRCd~-i@w^Pw`vD1sPK88JHkF*I##rSI+71HF9U_m^eIVW@XD$JrWMm~9 zLlGR_1ouD+eQ&BIdrjwH0mzuyFjmwffH-K#v13evJGDi8tU7h{P<;%cUU$j#T}X&< z*5GyxWD`PiBm;xTeOSR2Hv;$mCZ-ZvhwT+^lPR>@E#gWpnA;&EN!D~QCMTJ@+G5*Z zrj>cjEZ9zBjXej8mk2Ma$$<_maJuxDG5Tv^&GDi5ui0)Bo>r6l5GrE79(d4?#Df{qTi0;2ayPdwi~a{Oxl8{t!zm=heb zj1`%OerfO?k&=X%Zy&o_EDky+23#6=H5D2-P2uv}?$ymzs9LXU`P=W5zHl$(Pp)70 z0w8yRC!0xDoafSg{qV}~|b>mTZA#&0sw~ynF5JV5| z7*KHxFIS1}s{BydwnZw$HTp;_Nw+l4rEGUlBJs~bwNw0+2TOj2BEF2|GMwOCDzim1 zF_Eb z?cVH@_;pZ4f|kkcldx@L+96#mqimDmq9M?VoTbWyy$ET{VnrTkWpM%T$$F@gucYff zG>T_jX>zto#kYu-M8>LUBEP9a0yh-s)2Y%Akw{BPskUqkEM*rVKH3Xi4frmz0fYw& zslp4%vK-k*2H%YY{Eo^p%R0-z9s(Rs!#^UcdPhO21`o};mF%Mt0m-TkJ#J!=Z;H9n-&hr z^00j*d8akSdyx#Vvk(Se8%8Eok7R?PHl%rc**=au)$SSOGhqm4oRl!EvDCz_fG&T8 z0~jlE`vqrhpE-VH&$}gFO32#q2H=W$ux4yE?Wwp=4k=YlKmN;J=5(_1B-v8NpRBS- zXfbK2$^zukq}l?Ek&dzkwFP*ds;x{pp@ue_{zwK%cm0f%*qukUfp`byTekoERG(zb zP&y|t5zk{rx60u_8U5e2llU#_b7#L}68fSh^-4SyEeakk1|H+ArwJd!o&@)TcA?Zb zh=#O!`6MVtyZO6#Cn8S5^w?{u^k{%E*wRmds#9Ln0*?MrzX?Pnd&Ss3t>v zlet;7pzSpFsC)Do)zCy6Up8qt(ro0=OO^UAGNj>v*`@Q&jQ6BQdk}iOB9J#gM0_;9-^cye-g1M=&X^>TW=)jH|#4K 
zD4#YkX%EzTcpbV-BHx#72HA215URwC5_Ctr1>7DSNfcJG6psuAWaI(^9FxH) ze@ugyW^vQBkciOKNck)vgJED|-UI7k71l#GaG z1=8;A!@-{K$*K$yiR)+z>$(p=u>nsuibpBDfe2>1hJyeEGXWoJP6V7po@r&giK7I9 zzCFV6m<$G?K2<|lc=J|@XCOF_;paZFOm_7}ZBl)fu)#VjPIQiA!Xj`vv zg(Dadg`3oF&u;dmD#GO5R8^>7hVz9kjwUD!45`R`0Li$Yk>9kg(#6>WVxD^tGepM_ul;|;9sMYDmAG#-FHb!{{%!r+B>GY8`1J{^$@}q;x3WSrKIDDMJHUWG7kK}xL>0a zw%80Egk_j3HU-T|g6TL(iWI}%B!$E16hhrl5f(>~f&}WGwr*@F2uWm?vgnH^W9ZwD-r>veCBn@opA9dP4K*ZX$E2Y zA4WUaIEBMy&lA*R1OEt~Pu7ihjJ5olwKlR=h-T4#ZbM01?p)TzUv*7i#X0lTJ=(5M zt&Mi`>$|$6Y?{J87?~yCwx@3+$(N-&cC`-k!edim(7rGwo(+X2pJ@9ESkEtwmXu$>=f!PkJUHAL51 zAJDbdH`BEqp=+_NbZJDpJ~Y;Hor6OXu^SRGE+yjWZVob`smicSTuG(kduq5iynac! zL9%sd>6!1TtAC!z3|48LJtd`Yk5qhn??WmfgPEnGl%r=!j+o8%-9t|i$aGtj)BXE9l>H;-@Fd8sF4s1PyuNF9>D_r&~<4&Ysp+XVwc|>7c zOQS(&+d{wudaFsCZ-_P0z->mMEpFp@;3zQKQ95-wii3z=fUxJ<3?Ki1rxosby^s{n z7#A=BL0z&n8~~(7{pb{QSTACgPDe>LNECji9o3J#_$plRZ9p~*qYQbiE!=1|R=?TF zmgFQ=3>?5ZmA>5_jHTMil$OH4FL#PP)5C$S5iM=>Opb&_>7vMJs<27rLVFRY>HSfj zth59;6mah0KK$x)3QhzklAJ_>#)m*?Mg03xycKjE`<`4P0%vJVS-_F-opN7~Yez;*)f^rY)ipwxLWJvL6|@v)<)0G(f_ z8yC1VlHMmMb*`M1BK-I%E7_MxFq_V*9XZQzc@fQw4%-WDgmf2!wnt%C1dV+TAq6_c zDKbnK%4)nM{Y5NnCJG|Bt|giRIWYt@8{e4z=(T_8+&V46U4!z|AN|MaVpg7siI=JJ zc}mVv@+u{-Apr{P(}}l10&!e{LyRQw9FM03$-yK9FIHDNht5hi$+upCT=hpjEEnLs zGm3ySzSA_&YaUY-M)0>w6dy9BR8@oJ;&XA1ww&iK;0S%udH!>skBy}OhiqBc>wxh% zzsTQZZi7&&7KE|Ua-zl9LQEdJ0wOLF7^}HKoN3VJM?;d>FwXbyzr+TX3Ey9PmA)G! 
z;ITmpBsR!PN{==Hf#VTE`Zy23ZJgCOgWV1+EtaBA$A){2S%huat)?;^dYVl1XPR)_m$oLTv5>KPTsw*(f07 zyJf9n^LiW63&}VGKNuceCkW&1Uqyu94MQ9%pF=E#GkMJ*&H+PHWCiyHAYVW^A<&R3 z2>2Dp#P86;|1KrpK~foM`|kBNuhI%oSPt3|w-Mmce?V2fP03+O2x=v6rJ4Fjq4+&2 z`+Z8tN7xG3W*)!M{tQ(^Vq#BWziz5K;RB56xak1oW_@aC z8uVIvr(zaHSx-v3IH95&6}XvmY*4bE1@Mq9x>?-KnPo;d1D$voWyT{^P%=d2Qc{OX zQY!L0sJzU;1*j~_zLUEf)bAo%;B7r8`^=ogshO$a+^N#OMYpR4xp8^hY8>U0 z>hAxX9v6R2y{SGF!ebn05^*xmQpER@h{J(y;|>@Z;YeH3myCuQW#BHW(?HzPIA^g9 zv}GW|aMzb%#3inmHM+{O8a|COLza;lhtDL#$N0s@q2LW}4MusExd+EPj}6sDc$(#w zX=@?W{RMbbxFBS0;QE&jZvc4+;fo493k~^^2nZf{9-A+@!iH3(8btg03+SnkP~1?{ z&8T%VgI1uI>Cr>oQHf&dE~EJ#wk!i*6fmIJ3%0YhmPMc04Ba-#y?`JMw!tkKK+1Q1 zB7xQ!9Q;w#0}`E_RyKgz`rr*HkfO>IW5N}&uqTRupS?rXbUlU+837@1FH{Phjo>z5 zZD3#XuO+a64qS00C*v=+3~JjR2Vum&eVi!U7wS}(zLaq*L4s4?rW3=}Uf=W$hs18f z$JTJ<800}OVbe2jUVi=Zo8|KfhD>78o!wUwk+GX?MAjEL0_pw{pEqQPzw?E152*IX z+N+2C<#BnkO3^Z07$j^9=f8nHhtATGi_bvvE)T-$uuF2|Ws+^T4`0OtB(d4!PEf=# z)`8t|yQkfh$S>AqXu-iP*(|!GJ9tJjV0GL{KoBVq5w1)84Uzg=YHOh_ts1PQyfqP9 zb&+6l$Aj(0ad|@kRMAn{)@?bSHpNwrWNG)(etZ@+O@u#ZV3MYBo}{VMnu&W9WL!*` zCGEc^M`31Qt|lqa^^ZiDLczsioU2w@P^(tOA5fuGEk8gGS0h(ju-t+o`9i0}poI3R zkYPmHwYYES;V4=VuoizxiJUttxtWN=!IcCYOHeY^zw=t^gCUV8Dmw;Wa8I@bFak7; zv+d;Ci&?D_U;vBj)WZZNyQo?Avb?=1?b!Sq2g@S#59{^=b;QueMX-& pW=3X4XO7J5oiX;$9kR57@z7AUPjTt=7g=3X{14CQ8S(%C diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/goal_relabeling.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/goal_relabeling.cpython-310.pyc deleted file mode 100644 index 1a7a1c6ae62d2cbc72bda9a82806c8f4deec5107..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1186 zcmZWp&u<(x6t+FTHk;jm%CDpn$m(sSc2;U{C`6RP1)&~5>R~S!S(mSJSjz>pd_x0|!;(0lkEZp#uDLK?3pXkT(o^Vu5&k@d}5%QALtpxL4?}XUo=fVrhHX z3f*j(ZkFeYEuVe!_`ze&HeT4}#!A843# S5+&i{A5w~D#e~erhsocia!*$P diff --git a/prismatic/vla/datasets/rlds/utils/__pycache__/task_augmentation.cpython-310.pyc b/prismatic/vla/datasets/rlds/utils/__pycache__/task_augmentation.cpython-310.pyc 
deleted file mode 100644 index a6178537fc99ae4bcee4c12e89c6ccb8aab879fb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1702 zcmah~PmA0(6qjU=XV%?6u!L>4Jye8TmNuRidg(GO-6p3}8t9f%7J|`Qnz6IC z8C;N)_@#=Su-B=W&ni_r;R;6Ak6fQ_1d)Ne(C^04}XF^UTYEKdpgosg>ncSIXtcdNJi!+T~nN{Z0UC6&FVF#=xw0N$l2&uRnKmL=kZM`#y@>`{^(qa#wjyyOkpdy zu$lKD1y(sbHl?)VxhqOL#?!z5RCGHNSeEIkIr|uMz=R%>m!C>iDyLE(0wl`<83!Jc zNvShY+OuR6*G0nX!C+y!J$M(=3U@-8bR$<($~x7&LGQ_^3-Ph(n45M3n~O%lL*ZrK z4d-Zg5a l1`sWr-)Ac&Jhbt7jV6_r^HQB*;Vu3tg=9Axv0e62@DCOB;Mf2F diff --git a/vla-scripts/finetune.py b/vla-scripts/finetune.py index bbb85e4..b1472ef 100644 --- a/vla-scripts/finetune.py +++ b/vla-scripts/finetune.py @@ -57,6 +57,7 @@ from prismatic.vla.datasets import RLDSDataset, RLDSBatchTransform from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics from prismatic.models import load, load_vla +from prismatic.models.pi3_loader import load_pc_model @@ -70,6 +71,7 @@ class FinetuneConfig: vlm_path: str = "openvla/openvla-7b" # Path to OpenVLA model (on HuggingFace Hub or stored locally) use_minivlm: bool = False # resum_vla_path: str = "openvla/openvla-7b" # Path to OpenVLA model (on HuggingFace Hub or stored locally) + pi3_path: Path = Path("/home/ruihengwang/vla/VLA-Adapter/pretrained_models/pi3_checkpoint") # Dataset data_root_dir: Path = Path("datasets/rlds") # Directory containing RLDS datasets @@ -298,7 +300,8 @@ def run_forward_pass( num_patches, compute_diffusion_l1=False, use_pro_version=True, - cfg=None + cfg=None, + **kwargs ) -> Tuple[torch.Tensor, Dict[str, float]]: """ Compute model forward pass and metrics for both training and validation. 
@@ -927,6 +930,7 @@ def rename_state_dict_keys(state_dict, replace_map): }, to_bf16=True, ) + pi3_model = load_pc_model(cfg.pi3_path).to(device_id) # Get number of vision patches NUM_PATCHES = vla.module.vision_backbone.get_num_patches() * vla.module.vision_backbone.get_num_images_in_input() diff --git a/vla_adapter.egg-info/PKG-INFO b/vla_adapter.egg-info/PKG-INFO deleted file mode 100644 index 5738a98..0000000 --- a/vla_adapter.egg-info/PKG-INFO +++ /dev/null @@ -1,785 +0,0 @@ -Metadata-Version: 2.4 -Name: vla-adapter -Version: 0.0.1 -Summary: VLA-Adapter: An Effective Paradigm for Tiny-Scale Vision-Language-Action Model -Author-email: Yihao Wang , Pengxiang Ding , Lingxiao Li -License: MIT License - - Copyright (c) 2025 Yihao Wang, Pengxiang Ding, Lingxiao Li, Can Cui, Zirui Ge, Xinyang Tong, Wenxuan Song, Han Zhao, Wei Zhao, Pengxu Hou, Siteng Huang, Yifan Tang, Wenhui Wang, Ru Zhang, Jianyi Liu, and Donglin Wang. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
- -Project-URL: homepage, https://github.com/OpenHelix-Team/VLA-Adapter -Project-URL: repository, https://github.com/OpenHelix-Team/VLA-Adapter -Project-URL: documentation, https://github.com/OpenHelix-Team/VLA-Adapter -Keywords: vision-language-action models,tiny-scale backbone,fine-tuning,robotic learning -Classifier: Development Status :: 3 - Alpha -Classifier: Intended Audience :: Developers -Classifier: Intended Audience :: Education -Classifier: Intended Audience :: Science/Research -Classifier: License :: OSI Approved :: MIT License -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.8 -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence -Requires-Python: >=3.8 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: accelerate>=0.25.0 -Requires-Dist: draccus==0.8.0 -Requires-Dist: einops -Requires-Dist: huggingface_hub -Requires-Dist: json-numpy -Requires-Dist: jsonlines -Requires-Dist: matplotlib -Requires-Dist: peft==0.11.1 -Requires-Dist: protobuf -Requires-Dist: rich -Requires-Dist: sentencepiece==0.1.99 -Requires-Dist: timm==0.9.10 -Requires-Dist: tokenizers==0.19.1 -Requires-Dist: torch==2.2.0 -Requires-Dist: torchvision==0.17.0 -Requires-Dist: torchaudio==2.2.0 -Requires-Dist: transformers@ git+https://github.com/moojink/transformers-openvla-oft.git -Requires-Dist: wandb -Requires-Dist: tensorflow==2.15.0 -Requires-Dist: tensorflow_datasets==4.9.3 -Requires-Dist: tensorflow_graphics==2021.12.3 -Requires-Dist: dlimp@ git+https://github.com/moojink/dlimp_openvla -Requires-Dist: diffusers==0.30.3 -Requires-Dist: imageio -Requires-Dist: uvicorn -Requires-Dist: fastapi -Requires-Dist: json-numpy -Provides-Extra: dev -Requires-Dist: black>=24.2.0; extra == "dev" 
-Requires-Dist: gpustat; extra == "dev" -Requires-Dist: ipython; extra == "dev" -Requires-Dist: pre-commit; extra == "dev" -Requires-Dist: ruff>=0.2.2; extra == "dev" -Provides-Extra: sagemaker -Requires-Dist: boto3; extra == "sagemaker" -Requires-Dist: sagemaker; extra == "sagemaker" -Dynamic: license-file - -
- -
- - ->### The official implementation of **VLA-Adapter**. If you want to discuss the VLA-Adapter, please feel free to join our **WeChat group**. The QR code is [here](https://github.com/OpenHelix-Team/VLA-Adapter/issues/1)! -
- -
-

- -

-
- -> **📝 Paper: https://arxiv.org/abs/2502.19645**
-> **🌍 Project page: https://vla-adapter.github.io/**
-> **🤗 HuggingFace: https://huggingface.co/VLA-Adapter**
-> **Github: https://github.com/OpenHelix-Team/VLA-Adapter** - -
- -## :loudspeaker: News! -- **[2025/09/22]** We released our codes! An enhanced **Pro** version is also released (this version conforms to the pipeline in the original paper, but is optimized in implementation). Everyone is welcome to use it!🎉 -- **[2025/09/13]** Our paper won the 🥇**first place** in the [daily list](https://huggingface.co/papers/date/2025-09-12) and the 🥈**second place** in the [weekly list](https://huggingface.co/papers/week/2025-W37) in HF! ⭐ -- **[2025/09/12]** We released the original version of the VLA-Adapter for four LIBERO models on [HuggingFace](https://huggingface.co/VLA-Adapter). -- **[2025/09/11]** We released our paper on [ArXiv](https://arxiv.org/abs/2509.09372). - -
- -## :black_nib: TODO List - -- [x] Release **checkpoints** for reproduction. -- [ ] A more **powerful version**, **VLA-Adapter++**, and a detailed **technical report** 📝 will be released soon.
-- [ ] Continue to update the code to adapt to various **real-world systems** deployments, including the configuration of our paper, Franka, UR-5, and AGILE Piper.
-- [ ] It will soon be compatible with **various foundation models**, including but not limited to [VPP](https://arxiv.org/abs/2412.14803), [π0.5](https://arxiv.org/abs/2504.16054).
-- [ ] We will update the **diffusion transformers** and **flow matching** policy networks in the future, and the results will be updated in the subsequent VLA-Adapter++ technical report. -- [ ] We will also update and give more experiments on **Frozen backbone**. -- [ ] We will expand its **generalization** further in the future. Work is in progress! So please stay tuned! -- [ ] **RL post-training** is also in progress. Interested researchers are welcome to join us in building this foundation! -- [ ] **The dual-system compatibility** of VLA-Adapter is under exploration! - - -
- -## 🌟 Table of Contents - -- [:rocket: Quick Start](#rocket-quick-start) - - [Conda Environment of VLA-Adapter](#conda-environment-of-vla-adapter) - - [Install Dependencies](#install-dependencies) -- [:pencil: Data Preparation](#pencil-data-preparation) - - [LIBERO Benchmark](#libero-benchmark) - - [CALVIN Benchmark](#calvin-benchmark) - - [:video_game: Our Dependencies](#video_game-our-dependencies) - - [:pushpin: Benchmark Location](#pushpin-benchmark-location) -- [⚓ VLM backbone](#vlm) -- [:fire: Training for Different Configurations](#fire-training-for-different-configurations)   => Provides **training configurations** for GPUs ranging from **10GB** to **80GB** of VRAM. - - [:books: Related File for Training](#books-related-file-for-training) - - [:ledger: How to Train on Extremely Limited VRAM GPUs](#ledger-how-to-train-on-extremely-limited-vram-gpus)   => A card with 10GB-12GB *(e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070)* - - [:ledger: How to Train on Low VRAM GPUs](#ledger-how-to-train-on-low-vram-gpus)   => A card with 24GB *(e.g. NVIDIA GeForce RTX 3090 and 4090)* - - [:ledger: How to Train on Larger VRAM GPUs](#ledger-how-to-train-on-larger-vram-gpus)   => A Consumer GPU with 32GB *(e.g. NVIDIA GeForce RTX 5090)*   A Professional-Grade GPU with 40GB-48GB *(e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).* - - [:ledger: How to Train on Sufficient VRAM GPUs](#ledger-how-to-train-on-sufficient-vram-gpus)   => Professional-Grade GPUs with ≥80GB *(e.g. NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).* -- [:mechanical_arm: Inference](#mechanical_arm-inference) - - [:books: Related File for Inference](#books-related-file-for-inference) - - [🤗 Checkpoint of VLA-Adapter](#ckpts) - - [:notebook: How to Eval](#evals) -- [🌈 Success Rate Comparison](#results) -- [📝 Citation](#cite) -- [:heart: Acknowledgment](#heart-acknowledgment) - -
- -## :rocket: Quick Start - - -### Conda Environment of VLA-Adapter - -```bash -# Create and activate conda environment -conda create -n vla-adapter python=3.10.16 -y -conda activate vla-adapter -``` - -### Install Dependencies - -```bash -# Install PyTorch -# Use a command specific to your machine: https://pytorch.org/get-started/locally/ -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 - -# Clone vla-adapter repo and pip install to download dependencies -git clone https://github.com/OpenHelix-Team/VLA-Adapter.git -cd vla-adapter -pip install -e . - -pip install packaging ninja -ninja --version; echo $? # Verify Ninja --> should return exit code "0" - -# Install Flash Attention 2 for training (https://github.com/Dao-AILab/flash-attention) -pip install "flash-attn==2.5.5" --no-build-isolation -# If you run into difficulty, try `pip cache remove flash_attn` first, or visit the website to download it. (https://github.com/Dao-AILab/flash-attention/releases/tag/v2.5.5) -# You can download the corresponding `.whl` file according to the cuda version of `nvidia-smi`, and then run `pip install flash_attn-2.5.5+cuXX...whl` to install it. -# We use the `flash_attn-2.5.5+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl` file. -``` - -
-
- - -## :pencil: Data Preparation - -### LIBERO Benchmark - -- **(Optional)** - -Clone and install the [LIBERO repo](https://github.com/Lifelong-Robot-Learning/LIBERO) and required packages: - -```bash -git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git -pip install -e LIBERO -pip install -r experiments/robot/libero/libero_requirements.txt # From vla-adapter base dir -``` - -To download the [LIBERO datasets](https://huggingface.co/datasets/openvla/modified_libero_rlds) that we used in our fine-tuning experiments, run the command below. This will download the `Spatial`, `Object`, `Goal`, and `Long` datasets in `RLDS` format, i.e., `libero_spatial_no_noops`, `libero_object_no_noops`, `libero_goal_no_noops`, `libero_10_no_noops`. (`"_no_noops"` stands for no no-op actions, i.e., training samples with near-zero actions are filtered out). These datasets require `~10GB` of memory in total. If needed, see details on how to download the original non-RLDS datasets [here](https://github.com/openvla/openvla?tab=readme-ov-file#libero-setup). You can use these to fine-tune Prismatic-VLMs (built on Qwen2.5-0.5B) or other VLMs. - -```bash -git clone git@hf.co:datasets/openvla/modified_libero_rlds -``` - -When using LIBERO, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: - -```bash -sudo apt-get update -sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev -``` - -### CALVIN Benchmark - -- **(Optional)** - -```bash -git clone --recurse-submodules https://github.com/mees/calvin.git -export CALVIN_ROOT=$(pwd)/calvin -cd $CALVIN_ROOT - -# Installation of `pyhash` may fail on some machines. If it fails, you can solve it by lowering the `setuptools` version: `pip install setuptools==57.5.0` -sh install.sh -``` - -To download the [CALVIN ABC→D datasets](https://github.com/mees/calvin/tree/main/dataset) that we used in our fine-tuning experiments, run the command below. 
- -```bash -cd $CALVIN_ROOT/dataset -sh download_data.sh ABC -``` - -If you want to download the RLDS format, you can visit [here](https://huggingface.co/datasets/zhouhongyi/calvin_abc_rlds) to download it. This dataset require `~50GB` of memory. - -When using CALVIN, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: - -```bash -sudo apt-get update -sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev -``` - - -### :video_game: Our Dependencies - -- **(including LIBERO and CALVIN)** - -At this point, the environment is fully installed. If you want to confirm whether the environment is correct, you can see the `our_envs.txt` file we released. - - -### :pushpin: Benchmark Location - -The downloaded dataset can be placed in the `/data` folder. The overall directory structure is as follows: - -``` -· -├── data -· ├── libero - │ ├── libero_10_no_noops - │ │ └── 1.0.0 (It contains some json files and 32 tfrecord files) - │ ├── libero_goal_no_noops - │ │ └── 1.0.0 (It contains some json files and 16 tfrecord files) - │ ├── libero_object_no_noops - │ │ └── 1.0.0 (It contains some json files and 32 tfrecord files) - │ ├── libero_spatial_no_noops - │ │ └── 1.0.0 (It contains some json files and 16 tfrecord files) - │ - ├── calvin_abc - │ └── 1.0.0 (It contains some json files, 512 train tfrecord files, and 32 valid tfrecord files) - │ - └── other benchmarks ... -``` - -
-
- -## ⚓ VLM backbone -We use the `Prismatic-VLMs` architecture. Since the file is large, please download it from [here](https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b). Then put it in the `/pretrained_models` folder. The file structure is: - -``` -· -├── pretrained_models -· ├── configs - └── prism-qwen25-extra-dinosiglip-224px-0_5b -``` - - -
-
- -## :fire: Training for Different Configurations - -**We provide different training configurations for different users. You can choose the configuration suitable for training based on your GPU card type.** - -### :books: Related File for Training -* `vla-scripts/finetune.py`: VLA fine-tuning script - - -### :ledger: How to Train on Extremely Limited VRAM GPUs - -***=> Extremely Limited VRAM (A card with 10GB-12GB) (e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070).*** - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -If your resources are extremely limited, you can set `--batch_size 1` and `--lora_rank 64`, it only requires `9.6GB` of VRAM. Certainly, `batch size = 1` will cause gradient updates to be greatly affected by extreme values, and loss convergence will be unstable. In this case, you can modify the `grad_accumulation_steps` parameter to simulate a similar effect. For example, `--batch_size 1` with `--grad_accumulation_steps 8` has a similar effect to `--batch_size 8`, but the training speed will be slower. This means that you can't use the [OpenVLA-OFT](https://github.com/moojink/openvla-oft) model on a card with `10GB` because even with `batch size = 1`, it requires `25GB` of VRAM. Fortunately, you can use VLA-Adapter. However, the `batch size` is still small, you can increase `--max_steps` to achieve the performance reported in the paper. - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. 
You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version`, requiring only `8.6GB` of VRAM. You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - - ```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 400000 \ ---max_steps 400005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 1 \ ---grad_accumulation_steps 8 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the 
`/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - -
- -### :ledger: How to Train on Low VRAM GPUs - -***=> Low VRAM (A card with 24GB) (e.g. NVIDIA GeForce RTX 3090 and 4090).*** - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -If you have such a device, you can increase the `batch size` and `lora rank`: `--batch_size 4` and `--lora_rank 64`. This only takes nearly `20GB`. This is consistent with the rank in our paper. This means that you can't use the [OpenVLA-OFT](https://github.com/moojink/openvla-oft) model on a card with `24GB` because even with `batch size = 1`, it requires `25GB` of VRAM. Fortunately, you can use VLA-Adapter. However, the `batch size` is still small, you can increase `--max_steps` to achieve the performance reported in the paper. - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. 
The `original version` is nearly `1GB` smaller than the `pro version` (1 batch), requiring only `17.6GB` of VRAM. You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - - - ```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 200000 \ ---max_steps 200005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 4 \ ---grad_accumulation_steps 4 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - - - -
- -### :ledger: How to Train on Larger VRAM GPUs - -***=> A Consumer GPU with 32GB (e.g. NVIDIA GeForce RTX 5090)
=> A Professional-Grade GPU with 40GB-48GB (e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).*** - - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -If you have such a device, you can increase the `batch size` and `lora rank`: `--batch_size 8` and `--lora_rank 64`. This only takes nearly `29GB`. - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - -With this configuration, you can achieve the same results as in our paper on the `LIBERO-Object` benchmark, achieving a `99.2%` success rate, in just `8 hours`. The `LIBERO-Spatial` benchmark requires approximately 10 hours of training. However, the `LIBERO-Long` benchmark takes longer because its tasks are longer and more difficult, requiring more training steps to achieve superior performance. - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. 
The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - - ```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 200000 \ ---max_steps 200005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 8 \ ---grad_accumulation_steps 2 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - - - -
- -### :ledger: How to Train on Sufficient VRAM GPUs - -***=> Professional-Grade GPUs with ≥80GB (e.g. NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).*** - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -You can use 1 to 8 GPUs for training by changing the number of `CUDA_VISIBLE_DEVICES` to the GPU number and the number of GPUs after `--nproc-per-node`. In our paper, we use 4×H100 GPU for training. In this configuration, the four suites of the LIBERO benchmark, `Spatial` (only five hours), `Object` (less than one hour), `Goal` (three hours), and `Long` (half a day); the `CALVIN` benchmark (eight hours) - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). 
You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - -```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 150000 \ ---max_steps 150005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 16 \ ---grad_accumulation_steps 1 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--spatial--$current_time \ -> logs/VLA-Adapter--spatial--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - -## :mechanical_arm: Inference - -### :books: Related File for Inference -* `experiments/robot/libero/`: LIBERO eval files - * `run_libero_eval.py`: LIBERO eval script - * `libero_utils.py`: LIBERO eval utils -* `experiments/robot/`: General eval utils files - * `openvla_utils.py`: VLA-specific eval utils - * `robot_utils.py`: Other eval utils - -
- -### 🤗 Checkpoint of VLA-Adapter -We fine-tuned `Qwen2.5-0.5B` with our adapter bridge paradigm on four LIBERO task suites independently: `LIBERO-Spatial`, `LIBERO-Object`, `LIBERO-Goal`, and `LIBERO-Long`. -The four VLA-Adapter checkpoints for LIBERO are available on Hugging Face: -* [VLA-Adapter/LIBERO-Spatial](https://huggingface.co/VLA-Adapter/LIBERO-Spatial) -* [VLA-Adapter/LIBERO-Object](https://huggingface.co/VLA-Adapter/LIBERO-Object) -* [VLA-Adapter/LIBERO-Goal](https://huggingface.co/VLA-Adapter/LIBERO-Goal) -* [VLA-Adapter/LIBERO-Long](https://huggingface.co/VLA-Adapter/LIBERO-Long) - -In addition, we also provide a `Pro` version, we used `4*H100` GPUs for training, `--batch_size 16`, `--lora rank 64`, and the `--max_steps 100000`. The Pro checkpoints is: - -* [VLA-Adapter/LIBERO-Spatial-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Spatial-Pro) `(97.8 -> 99.6)` -* [VLA-Adapter/LIBERO-Object-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Object-Pro) `(99.2 -> 99.6)` -* [VLA-Adapter/LIBERO-Goal-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Goal-Pro) `(97.2 -> 98.2)` -* [VLA-Adapter/LIBERO-Long-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Long-Pro) `(95.0 -> 96.4)` -* [VLA-Adapter/CALVIN-ABC-Pro](https://huggingface.co/VLA-Adapter/CALVIN-ABC-Pro) `(4.42 -> 4.50)` - -These files need to be placed in the `/output` folder. If you trained your own models, it will also be stored here. The subsequent eval code will call the model in this folder for inference. - - -
- - -### :notebook: How to Eval - -**We strongly recommend that you use our open source `Pro` version of the model, which has stronger performance.** To start evaluations with one of these checkpoints, run one of the commands below. Each will automatically download the appropriate checkpoint listed above. If you want to use the original version of the model, you only need to adjust the `-- use_pro_version` parameter to `False` and pass the original version of the model to the `--pretrained_checkpoint` parameter. Finally, the inference results will be displayed in the `/eval_logs` folder, and the inference video will be displayed in the `/rollouts/vla-adapter` folder. - - -```bash -# Launch LIBERO-Spatial-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Spatial-Pro \ - --task_suite_name libero_spatial \ - --use_pro_version True \ - > eval_logs/Spatial--chkpt.log 2>&1 & - - -# Launch LIBERO-Object-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Object-Pro \ - --task_suite_name libero_object \ - --use_pro_version True \ - > eval_logs/Object--chkpt.log 2>&1 & - - -# Launch LIBERO-Goal-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Goal-Pro \ - --task_suite_name libero_goal \ - --use_pro_version True \ - > eval_logs/Goal--chkpt.log 2>&1 & - - -# Launch LIBERO-Long-Pro (LIBERO-10) evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - 
--pretrained_checkpoint outputs/LIBERO-long-Pro \ - --task_suite_name libero_10 \ - --use_pro_version True \ - > eval_logs/Long--chkpt.log 2>&1 & - - -# Launch CALVIN ABC→D-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python vla-scripts/evaluate_calvin.py \ - --pretrained_checkpoint outputs/CALVIN-ABC-Pro \ - > eval_logs/CALVIN--ABC.log 2>&1 & -``` - -The evaluation script will run 500 trials by default (10 tasks x 50 episodes each) in LIBERO and 1,000 task sequences in CALVIN. Use the same card for training and inference whenever possible. **Note that results may vary slightly if you use a different GPU than the H100.** - - -If you want to get the inference **throughput**, you can run it in the `run_libero_eval.py` file. You can add `start = time.time()` and `end = time.time()` before and after `lines 334--345` and calculate the difference between the two. This difference is the time it takes to generate `8 chunks`. This gives you the inference throughput. We measured it multiple times and took the average value of `0.036s`. - -
- -## 🌈 Success Rate Comparison - -All our results are inferred on `H100`. You can find the inference `log` file in the model released on [HF](https://huggingface.co/VLA-Adapter) for viewing. - -### Performance on LIBERO benchmark. - -XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LIBERO MethodsScale SpatialObject GoalLong Avg.
Large-scaleFlowVLA (Zhong et al., 2025)8.5B93.295.091.672.688.1
UnifiedVLA (Wang et al., 2025)8.5B95.498.8* 93.6 94.0 95.5
OpenVLA (Kim et al., 2024)7B84.788.479.253.776.5
OpenVLA-OFT (Kim et al., 2025)7B97.6*98.497.994.5*97.1*
UniVLA (Bu et al., 2025)7B96.5 96.8 95.6 92.0 95.2
CoT-VLA (Zhao et al., 2025)7B87.5 91.6 87.6 69.0 81.1
WorldVLA (Cen et al., 2025)7B87.6 96.2 83.4 60.0 81.8
TraceVLA (Zheng et al., 2025)7B84.6 85.2 75.1 54.1 74.8
MolmoAct (Lee et al., 2025)7B87.0 95.4 87.6 77.2 86.6
ThinkAct (Huang et al., 2025)7B88.3 91.4 87.1 70.9 84.4
Small-scale4D-VLA (Zhang et al., 2025)4B88.9 95.2 90.9 79.1 88.6
SpatialVLA (Qu et al., 2025)4B88.2 89.9 78.6 55.5 78.1
π0 (Black et al., 2024)3B96.898.8*95.8 85.2 94.2
π0-FAST (Pertsch et al., 2025)3B96.4 96.8 88.6 60.2 85.5
NORA (Hung et al., 2025)3B92.2 95.4 89.4 74.6 87.9
SmolVLA (Shukor et al., 2025)2.2B93.0 94.0 91.0 77.0 88.8
GR00T N1 (NVIDIA et al., 2025)2B94.4 97.6 93.0 90.6 93.9
Tiny-scaleSeer (Tian et al., 2025)0.57B- - - 78.7 78.7
VLA-OS (Gao et al., 2025)0.5B87.0 96.5 92.7 66.0 85.6
Diffusion Policy (Chi et al., 2023)-78.3 92.5 68.3 50.5 72.4
VLA-Adapter (Ours)0.5B97.899.297.2* 95.0 97.3
VLA-Adapter-Pro (Ours)0.5B99.699.6 98.296.498.5
- -### Performance on CALVIN ABC→D benchmark. - -XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CALVIN MethodsScale 12 34 5 Avg. len
Large-scaleUniVLA (Bu et al., 2025) 7B 95.5 85.8 75.4 66.9 56.5 3.80
OpenVLA (Kim et al., 2024) 7B 91.3 77.8 62.0 52.1 43.5 3.27
OpenVLA-OFT (Kim et al., 2025) 7B 96.3 89.1 82.4 75.8 66.5 4.10
VLAS (Zhao et al., 2025b) 7B 87.2 64.2 40.9 28.1 19.6 2.40
LCB (Shentu et al., 2024) 7B 73.6 50.2 28.5 16.0 9.9 1.78
RoboDual (Bu et al., 2024a) 7B 94.4 82.7 72.1 62.4 54.4 3.66
OpenHelix (Cui et al., 2025) 7B 97.1* 91.4 82.8 72.6 64.1 4.08
ReconVLA (Song et al., 2025c) 7B 95.6 87.6 76.9 69.3 64.1 3.95
Small-scaleDeeR (Yue et al., 2024) 3B 86.2 70.1 51.8 41.5 30.4 2.82
RoboFlamingo (Li et al., 2024b) 3B 82.4 61.9 46.6 33.1 23.5 2.48
VPP (Hu et al., 2025) 1.5B 95.7 91.2 86.3* 81.0* 75.0* 4.33*
SuSIE (Black et al., 2024)1.3B 87.0 69.0 49.0 38.0 26.0 2.69
Tiny-scaleSeer-Large (Tian et al., 2025)0.57B 96.3 91.6* 86.1 80.3 74.0 4.28
MoDE (Reuss et al., 2025) 0.44B 96.2 88.9 81.1 71.8 63.5 4.01
Seer (Tian et al., 2025) 0.32B 94.4 87.2 79.9 72.2 64.3 3.98
VLA-Adapter (Ours)0.5B99.1 94.6 88.8 82.8 76.5 4.42
VLA-Adapter-Pro (Ours)0.5B98.595.0 90.585.380.04.50
- - -
- - -## 📝 Citation - -### 🫶 If you feel that this paper, models, or codes are helpful, please cite our paper, thanks for your support of VLA-Adapter! - -```bibtex -@article{wang2025vlaadapter, - author={Wang, Yihao and Ding, Pengxiang and Li, Lingxiao and Cui, Can and Ge, Zirui and Tong, Xinyang and Song, Wenxuan and Zhao, Han and Zhao, Wei and Hou, Pengxu and Huang, Siteng and Tang, Yifan and Wang, Wenhui and Zhang, Ru and Liu, Jianyi and Wang, Donglin}, - title={VLA-Adapter: An Effective Paradigm for Tiny-Scale Vision-Language-Action Model}, - journal={arXiv preprint arXiv:2509.09372}, - year={2025} -} -``` - -## :heart: Acknowledgment - -We thank [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [MiniVLA](https://github.com/Stanford-ILIAD/openvla-mini), and [RoboDual](https://github.com/OpenDriveLab/RoboDual) for their open-sourced work! diff --git a/vla_adapter.egg-info/SOURCES.txt b/vla_adapter.egg-info/SOURCES.txt deleted file mode 100644 index 6cdf1af..0000000 --- a/vla_adapter.egg-info/SOURCES.txt +++ /dev/null @@ -1,118 +0,0 @@ -LICENSE -README.md -pyproject.toml -experiments/robot/openvla_utils.py -experiments/robot/robot_utils.py -experiments/robot/libero/libero_utils.py -experiments/robot/libero/regenerate_libero_dataset.py -experiments/robot/libero/run_libero_eval.py -pretrained_models/configs/configuration_prismatic.py -pretrained_models/configs/modeling_prismatic.py -pretrained_models/configs/processing_prismatic.py -prismatic/__init__.py -prismatic/py.typed -prismatic/conf/__init__.py -prismatic/conf/datasets.py -prismatic/conf/models.py -prismatic/conf/vla.py -prismatic/extern/__init__.py -prismatic/extern/hf/__init__.py -prismatic/extern/hf/configuration_prismatic.py -prismatic/extern/hf/modeling_prismatic.py -prismatic/extern/hf/processing_prismatic.py -prismatic/models/__init__.py -prismatic/models/action_heads.py -prismatic/models/film_vit_wrapper.py -prismatic/models/load.py -prismatic/models/materialize.py -prismatic/models/projectors.py 
-prismatic/models/registry.py -prismatic/models/transformer_utils.py -prismatic/models/backbones/__init__.py -prismatic/models/backbones/llm/__init__.py -prismatic/models/backbones/llm/base_llm.py -prismatic/models/backbones/llm/llama2.py -prismatic/models/backbones/llm/mistral.py -prismatic/models/backbones/llm/phi.py -prismatic/models/backbones/llm/qwen25.py -prismatic/models/backbones/llm/prompting/__init__.py -prismatic/models/backbones/llm/prompting/base_prompter.py -prismatic/models/backbones/llm/prompting/llama2_chat_prompter.py -prismatic/models/backbones/llm/prompting/mistral_instruct_prompter.py -prismatic/models/backbones/llm/prompting/phi_prompter.py -prismatic/models/backbones/llm/prompting/qwen_prompter.py -prismatic/models/backbones/llm/prompting/vicuna_v15_prompter.py -prismatic/models/backbones/vision/__init__.py -prismatic/models/backbones/vision/base_vision.py -prismatic/models/backbones/vision/clip_vit.py -prismatic/models/backbones/vision/dinoclip_vit.py -prismatic/models/backbones/vision/dinosiglip_vit.py -prismatic/models/backbones/vision/dinov2_vit.py -prismatic/models/backbones/vision/in1k_vit.py -prismatic/models/backbones/vision/siglip_vit.py -prismatic/models/vlas/__init__.py -prismatic/models/vlas/openvla.py -prismatic/models/vlms/__init__.py -prismatic/models/vlms/base_vlm.py -prismatic/models/vlms/prismatic.py -prismatic/overwatch/__init__.py -prismatic/overwatch/overwatch.py -prismatic/preprocessing/__init__.py -prismatic/preprocessing/download.py -prismatic/preprocessing/materialize.py -prismatic/preprocessing/datasets/__init__.py -prismatic/preprocessing/datasets/datasets.py -prismatic/training/__init__.py -prismatic/training/materialize.py -prismatic/training/metrics.py -prismatic/training/train_utils.py -prismatic/training/strategies/__init__.py -prismatic/training/strategies/base_strategy.py -prismatic/training/strategies/ddp.py -prismatic/training/strategies/fsdp.py -prismatic/util/__init__.py -prismatic/util/batching_utils.py 
-prismatic/util/data_utils.py -prismatic/util/nn_utils.py -prismatic/util/torch_utils.py -prismatic/vla/__init__.py -prismatic/vla/action_tokenizer.py -prismatic/vla/constants.py -prismatic/vla/materialize.py -prismatic/vla/datasets/__init__.py -prismatic/vla/datasets/datasets.py -prismatic/vla/datasets/rlds/__init__.py -prismatic/vla/datasets/rlds/dataset.py -prismatic/vla/datasets/rlds/obs_transforms.py -prismatic/vla/datasets/rlds/traj_transforms.py -prismatic/vla/datasets/rlds/oxe/__init__.py -prismatic/vla/datasets/rlds/oxe/configs.py -prismatic/vla/datasets/rlds/oxe/materialize.py -prismatic/vla/datasets/rlds/oxe/mixtures.py -prismatic/vla/datasets/rlds/oxe/transforms.py -prismatic/vla/datasets/rlds/oxe/utils/droid_utils.py -prismatic/vla/datasets/rlds/utils/__init__.py -prismatic/vla/datasets/rlds/utils/data_utils.py -prismatic/vla/datasets/rlds/utils/goal_relabeling.py -prismatic/vla/datasets/rlds/utils/task_augmentation.py -scripts/generate.py -scripts/preprocess.py -scripts/pretrain.py -scripts/additional-datasets/lrv_instruct.py -scripts/additional-datasets/lvis_instruct_4v.py -scripts/extern/convert_prismatic_weights_to_hf.py -scripts/extern/verify_prismatic.py -vla-scripts/calvin_env_wrapper.py -vla-scripts/deploy.py -vla-scripts/evaluate_calvin.py -vla-scripts/finetune.py -vla-scripts/merge_lora_weights_and_save.py -vla-scripts/train.py -vla-scripts/vla_evaluation.py -vla-scripts/extern/convert_openvla_weights_to_hf.py -vla-scripts/extern/verify_openvla.py -vla_adapter.egg-info/PKG-INFO -vla_adapter.egg-info/SOURCES.txt -vla_adapter.egg-info/dependency_links.txt -vla_adapter.egg-info/requires.txt -vla_adapter.egg-info/top_level.txt \ No newline at end of file diff --git a/vla_adapter.egg-info/dependency_links.txt b/vla_adapter.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/vla_adapter.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/vla_adapter.egg-info/requires.txt 
b/vla_adapter.egg-info/requires.txt deleted file mode 100644 index 839cc6c..0000000 --- a/vla_adapter.egg-info/requires.txt +++ /dev/null @@ -1,38 +0,0 @@ -accelerate>=0.25.0 -draccus==0.8.0 -einops -huggingface_hub -json-numpy -jsonlines -matplotlib -peft==0.11.1 -protobuf -rich -sentencepiece==0.1.99 -timm==0.9.10 -tokenizers==0.19.1 -torch==2.2.0 -torchvision==0.17.0 -torchaudio==2.2.0 -transformers@ git+https://github.com/moojink/transformers-openvla-oft.git -wandb -tensorflow==2.15.0 -tensorflow_datasets==4.9.3 -tensorflow_graphics==2021.12.3 -dlimp@ git+https://github.com/moojink/dlimp_openvla -diffusers==0.30.3 -imageio -uvicorn -fastapi -json-numpy - -[dev] -black>=24.2.0 -gpustat -ipython -pre-commit -ruff>=0.2.2 - -[sagemaker] -boto3 -sagemaker diff --git a/vla_adapter.egg-info/top_level.txt b/vla_adapter.egg-info/top_level.txt deleted file mode 100644 index 6720ec3..0000000 --- a/vla_adapter.egg-info/top_level.txt +++ /dev/null @@ -1,7 +0,0 @@ -eval_logs -experiments -figure -pretrained_models -prismatic -scripts -vla-scripts From ff9a926a510311bbfbc46f3d986cb474d00279a2 Mon Sep 17 00:00:00 2001 From: ruiheng123 Date: Mon, 3 Nov 2025 09:10:41 +0800 Subject: [PATCH 3/6] feature(wrh): add pi3 injection --initial version --- eval.sh | 11 +- experiments/robot/libero/run_libero_eval.py | 25 +- experiments/robot/openvla_utils.py | 31 ++- experiments/robot/robot_utils.py | 5 +- .../configs/modeling_prismatic.py | 6 +- prismatic/extern/hf/modeling_prismatic.py | 6 +- prismatic/models/action_heads.py | 259 +++++++++++++++++- run.sh | 14 +- vla-scripts/finetune.py | 37 ++- 9 files changed, 357 insertions(+), 37 deletions(-) diff --git a/eval.sh b/eval.sh index 0b4e84b..a15a096 100644 --- a/eval.sh +++ b/eval.sh @@ -1,9 +1,14 @@ -CUDA_VISIBLE_DEVICES=3 python experiments/robot/libero/run_libero_eval.py \ +export HF_HUB_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +CUDA_VISIBLE_DEVICES=1 python experiments/robot/libero/run_libero_eval.py \ 
--use_proprio True \ --num_images_in_input 2 \ --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Long-Pro \ + --pretrained_checkpoint outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--2025_10_27_17_22_41--use_3d_True_dim_2048_inject_all--170000_chkpt \ --task_suite_name libero_10 \ --use_pro_version True \ + --use_3d True \ + --inject_layers all \ # outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--1759126170--160000_chkpt \ -# > eval_logs/Spatial--chkpt.log 2>&1 & \ No newline at end of file + # > eval_logs/Spatial--chkpt.log 2>&1 & \ No newline at end of file diff --git a/experiments/robot/libero/run_libero_eval.py b/experiments/robot/libero/run_libero_eval.py index bd82b14..88a76f2 100644 --- a/experiments/robot/libero/run_libero_eval.py +++ b/experiments/robot/libero/run_libero_eval.py @@ -12,7 +12,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Optional, Union +from typing import Optional, Union, List import draccus import numpy as np @@ -48,7 +48,7 @@ set_seed_everywhere, ) from prismatic.vla.constants import NUM_ACTIONS_CHUNK - +from prismatic.models.pi3_loader import load_pc_model # Define task suite constants class TaskSuite(str, Enum): @@ -128,6 +128,11 @@ class GenerateConfig: use_pro_version: bool = True # encourage to use the pro models we released. 
phase: str = "Inference" + use_3d: bool = False + dim_3d: int = 2048 + pi3_path: Path = Path("/home/ruihengwang/vla/VLA-Adapter/pretrained_models/pi3_checkpoint") + inject_layers: Optional[int | List[int] | str] = None + def validate_config(cfg: GenerateConfig) -> None: @@ -292,6 +297,7 @@ def run_episode( noisy_action_projector=None, initial_state=None, log_file=None, + pi3_model=None ): """Run a single episode in the environment.""" # Reset environment @@ -342,7 +348,8 @@ def run_episode( proprio_projector=proprio_projector, noisy_action_projector=noisy_action_projector, use_film=cfg.use_film, - use_minivlm=cfg.use_minivlm + use_minivlm=cfg.use_minivlm, + pi3_model=pi3_model ) action_queue.extend(actions) @@ -383,7 +390,8 @@ def run_task( total_episodes=0, total_successes=0, log_file=None, - save_version=None + save_version=None, + pi3_model=None ): """Run evaluation for a single task.""" # Get task @@ -433,6 +441,7 @@ def run_task( noisy_action_projector, initial_state, log_file, + pi3_model ) # Update counters @@ -483,6 +492,10 @@ def eval_libero(cfg: GenerateConfig) -> float: # Initialize model and components model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg) + if cfg.use_3d: + pi3_model = load_pc_model(cfg.pi3_path) + else: + pi3_model = None # for name, param in model.named_parameters(): # if 'action_queries' in name: @@ -500,6 +513,7 @@ def eval_libero(cfg: GenerateConfig) -> float: num_tasks = task_suite.n_tasks log_message(f"Task suite: {cfg.task_suite_name}", log_file) + log_message(f"Using pretrained checkpoint: {cfg.pretrained_checkpoint}", log_file) # Start evaluation total_episodes, total_successes = 0, 0 @@ -517,7 +531,8 @@ def eval_libero(cfg: GenerateConfig) -> float: total_episodes, total_successes, log_file, - cfg.save_version + cfg.save_version, + pi3_model ) # Calculate final success rate diff --git a/experiments/robot/openvla_utils.py b/experiments/robot/openvla_utils.py index 03cb7c5..6278c2d 
100644 --- a/experiments/robot/openvla_utils.py +++ b/experiments/robot/openvla_utils.py @@ -32,7 +32,7 @@ ACTION_PROPRIO_NORMALIZATION_TYPE, ) from prismatic.vla.datasets.rlds.utils.data_utils import NormalizationType - +from prismatic.models.pi3_loader import load_pc_model # Initialize important constants DATE = time.strftime("%Y_%m_%d") DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S") @@ -507,6 +507,9 @@ def get_action_head(cfg: Any, llm_dim: int) -> Union[L1RegressionActionHead]: hidden_dim=llm_dim, action_dim=ACTION_DIM, use_pro_version=cfg.use_pro_version, + use_3d=cfg.use_3d, + dim_3d=cfg.dim_3d, + inject_layers=cfg.inject_layers, ) else: @@ -745,6 +748,8 @@ def get_vla_action( noisy_action_projector: Optional[torch.nn.Module] = None, use_film: bool = False, use_minivlm: bool = False, + use_3d_model: bool = False, + pi3_model: Optional[torch.nn.Module] = None ) -> List[np.ndarray]: """ Generate action predictions with the VLA policy. @@ -764,6 +769,11 @@ def get_vla_action( List[np.ndarray]: Predicted actions """ with torch.inference_mode(): + if use_3d_model: + assert pi3_model is not None + pi3_model = pi3_model.to(DEVICE).to(torch.bfloat16) + + # Collect all input images all_images = [obs["full_image"]] @@ -795,6 +805,24 @@ def get_vla_action( all_wrist_pixel_values = [wrist_inputs["pixel_values"] for wrist_inputs in all_wrist_inputs] inputs["pixel_values"] = torch.cat([primary_pixel_values] + all_wrist_pixel_values, dim=1) + if use_3d_model: + img_1, img_2 = inputs["pixel_values"][:, 0:3, :, :].to(DEVICE).to(torch.bfloat16), inputs["pixel_values"][:, 6:9, :, :].to(DEVICE).to(torch.bfloat16) + pi3_num_reg_token = 5 + + img_tensor = torch.stack([img_1, img_2], dim=1) # [B, 2, 3, H, W] where 2 indicates 2 views + B, N, _, H, W = img_tensor.shape + img_tensor = img_tensor.reshape((B*N, _, H, W)) + hidden = pi3_model.encoder(img_tensor, is_training=True) + if isinstance(hidden, dict): + hidden = hidden["x_norm_patchtokens"] + hidden, pos = 
pi3_model.decode(hidden, N, H, W) + hidden = hidden[:, pi3_num_reg_token:, :] + L_3d, dim_3d = hidden.shape[-2:] + hidden = hidden.reshape(B, -1, L_3d, dim_3d) + hidden = hidden.reshape(B, -1, dim_3d) + else: + hidden = None + # Process proprioception data if used proprio = None if cfg.use_proprio: @@ -819,6 +847,7 @@ def get_vla_action( noisy_action_projector=noisy_action_projector, action_head=action_head, use_film=use_film, + hidden_3d=hidden ) # Extract subset of actions for open loop steps diff --git a/experiments/robot/robot_utils.py b/experiments/robot/robot_utils.py index 61cedba..32c7806 100644 --- a/experiments/robot/robot_utils.py +++ b/experiments/robot/robot_utils.py @@ -107,6 +107,7 @@ def get_action( noisy_action_projector: Optional[torch.nn.Module] = None, use_film: bool = False, use_minivlm: bool = False, + pi3_model: Optional[torch.nn.Module] = None ) -> Union[List[np.ndarray], np.ndarray]: """ Query the model to get action predictions. @@ -140,7 +141,9 @@ def get_action( proprio_projector=proprio_projector, noisy_action_projector=noisy_action_projector, use_film=use_film, - use_minivlm=use_minivlm + use_minivlm=use_minivlm, + use_3d_model=cfg.use_3d, + pi3_model=pi3_model ) else: raise ValueError(f"Unsupported model family: {cfg.model_family}") diff --git a/pretrained_models/configs/modeling_prismatic.py b/pretrained_models/configs/modeling_prismatic.py index 968b95c..24bb0a4 100644 --- a/pretrained_models/configs/modeling_prismatic.py +++ b/pretrained_models/configs/modeling_prismatic.py @@ -879,6 +879,7 @@ def _regression_or_discrete_prediction( action_head=None, proprio=None, proprio_projector=None, + hidden_3d=None ): """Run L1 regression-based continuous action prediction or discrete action tokens prediction.""" @@ -929,7 +930,8 @@ def _regression_or_discrete_prediction( # L1 regression prediction normalized_actions = action_head.predict_action(multi_layer_hidden_states, proprio=proprio, - proprio_projector=proprio_projector) + 
proprio_projector=proprio_projector, + hidden_3d=hidden_3d) normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM) normalized_actions = normalized_actions.float().cpu().detach().numpy() else: @@ -980,6 +982,7 @@ def predict_action( pixel_values = kwargs["pixel_values"] # [1, 12, 224, 224] attention_mask = kwargs["attention_mask"] # + hidden_3d = kwargs.get("hidden_3d", None) # Create fake labels tensor (needed for action mask) labels = input_ids.clone() @@ -1026,6 +1029,7 @@ def predict_action( action_head=action_head, proprio=proprio, # [8] proprio_projector=proprio_projector, + hidden_3d=hidden_3d, ) # Unnormalize predicted actions diff --git a/prismatic/extern/hf/modeling_prismatic.py b/prismatic/extern/hf/modeling_prismatic.py index 17fb6a4..228c75f 100644 --- a/prismatic/extern/hf/modeling_prismatic.py +++ b/prismatic/extern/hf/modeling_prismatic.py @@ -879,6 +879,7 @@ def _regression_or_discrete_prediction( action_head=None, proprio=None, proprio_projector=None, + hidden_3d=None ): """Run L1 regression-based continuous action prediction or discrete action tokens prediction.""" @@ -929,7 +930,8 @@ def _regression_or_discrete_prediction( # L1 regression prediction normalized_actions = action_head.predict_action(multi_layer_hidden_states, proprio=proprio, - proprio_projector=proprio_projector) + proprio_projector=proprio_projector, + hidden_3d=hidden_3d) normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM) normalized_actions = normalized_actions.float().cpu().detach().numpy() else: @@ -980,6 +982,7 @@ def predict_action( pixel_values = kwargs["pixel_values"] # [1, 12, 224, 224] attention_mask = kwargs["attention_mask"] # + hidden_3d = kwargs.get("hidden_3d", None) # Create fake labels tensor (needed for action mask) labels = input_ids.clone() @@ -1026,6 +1029,7 @@ def predict_action( action_head=action_head, proprio=proprio, # [8] proprio_projector=proprio_projector, + hidden_3d=hidden_3d, ) # Unnormalize 
predicted actions diff --git a/prismatic/models/action_heads.py b/prismatic/models/action_heads.py index 5f24d66..9407bac 100644 --- a/prismatic/models/action_heads.py +++ b/prismatic/models/action_heads.py @@ -27,25 +27,42 @@ def __init__( action_dim=7, num_task_tokens=512, use_pro_version=False, + use_3d=False, + dim_3d=None, + inject_layers=None ): super().__init__() + self.use_3d = use_3d self.num_task_tokens = num_task_tokens self.action_dim = action_dim self.hidden_dim = hidden_dim - self.model = MLPResNet( - num_blocks=24, - input_dim=input_dim*ACTION_DIM, - hidden_dim=hidden_dim, - output_dim=action_dim, - use_pro_version=use_pro_version - ) + if not self.use_3d: + self.model = MLPResNet( + num_blocks=24, + input_dim=input_dim*ACTION_DIM, + hidden_dim=hidden_dim, + output_dim=action_dim, + use_pro_version=use_pro_version + ) + else: + assert dim_3d is not None, "dim_3d must be specified when use_3d is True!" + self.model = MLPResNetw3d( + num_blocks=24, + input_dim=input_dim*ACTION_DIM, + hidden_dim=hidden_dim, + output_dim=action_dim, + use_pro_version=use_pro_version, + feat_3d_dim=dim_3d, + inject_layers=inject_layers + ) def predict_action( self, actions_hidden_states, proprio=None, proprio_projector=None, - phase="Inference" + phase="Inference", + **kwargs ): """ * action_hidden_states: [B, Hidden, L_v + L_a, Dim] @@ -79,13 +96,23 @@ def predict_action( random_perturbations = learnable_random_perturbations(seq_len, dim, device=rearranged_actions_hidden_states.device, dtype=rearranged_actions_hidden_states.dtype) rearranged_actions_hidden_states = (rearranged_actions_hidden_states + random_perturbations) # (1, seq_len, dim) print("-----------------") - - action = self.model( - rearranged_actions_hidden_states, - h_a=actions_hidden_states, - p=proprio_features, - h_t=task_hidden_states - ) + if not self.use_3d: + action = self.model( + rearranged_actions_hidden_states, + h_a=actions_hidden_states, + p=proprio_features, + h_t=task_hidden_states + ) + 
else: + h_3d = kwargs.get("hidden_3d", None) + assert h_3d is not None, "h_3d must be passed when use_3d is True!" + action = self.model( + rearranged_actions_hidden_states, + h_a=actions_hidden_states, + p=proprio_features, + h_t=task_hidden_states, + h_3d=h_3d + ) return action @@ -130,6 +157,62 @@ def forward(self, x, h_a=None, h_t=None, p= None): x = self.fc2(x) # shape: (batch_size, output_dim) return x +class MLPResNetw3d(nn.Module): + """MLP with residual connection blocks.""" + def __init__( + self, + num_blocks, + input_dim, + hidden_dim, + output_dim, + use_pro_version=True, + feat_3d_dim=2048, + inject_layers=0 + ): + + super().__init__() + self.layer_norm1 = nn.LayerNorm(input_dim) + self.fc1 = nn.Linear(input_dim, hidden_dim) + self.relu = nn.ReLU() + self.mlp_resnet_blocks = nn.ModuleList() + # if use_3d_feat: + self.feat_3d_dim = feat_3d_dim + self.feat_3d_align = nn.Sequential( + nn.LayerNorm(self.feat_3d_dim), + nn.Linear(self.feat_3d_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim) + ) + self.inject_layers = inject_layers # TODO: inject 3D feat in only one layer + for i in range(num_blocks): + if self.inject_layers == "all": + self.mlp_resnet_blocks.append(MLPResNetBlock_Pro_w3d(dim=hidden_dim)) + elif isinstance(self.inject_layers, int) and i == self.inject_layers: + self.mlp_resnet_blocks.append(MLPResNetBlock_Pro_w3d(dim=hidden_dim)) + else: + self.mlp_resnet_blocks.append(MLPResNetBlock_Pro(dim=hidden_dim)) + + self.layer_norm2 = nn.LayerNorm(hidden_dim) + self.fc2 = nn.Linear(hidden_dim, output_dim) + + + def forward(self, x, h_a=None, h_t=None, p= None, h_3d=None): + #* [B, A_chunk, A_dim * Dim] -> [B, A_chunk, Dim] -> [B, A_chunk, A_dim] + #* 每一个 block 内部的过程是: + # x: (batch_size, input_dim) + h_3d = self.feat_3d_align(h_3d) + x = self.layer_norm1(x) # shape: (batch_size, input_dim) + x = self.fc1(x) # shape: (batch_size, hidden_dim) + x = self.relu(x) # shape: (batch_size, hidden_dim) + for i, block in 
enumerate(self.mlp_resnet_blocks): + if isinstance(block, MLPResNetBlock_Pro_w3d): + x = block(x, h_t = h_t[:,i+1,:], h_a = h_a[:,i+1,:], p=p, h_3d=h_3d) # shape: (batch_size, hidden_dim) + elif isinstance(block, MLPResNetBlock_Pro): + x = block(x, h_t = h_t[:,i+1,:], h_a = h_a[:,i+1,:], p=p) # shape: (batch_size, hidden_dim) + x = self.layer_norm2(x) # shape: (batch_size, hidden_dim) + x = self.fc2(x) # shape: (batch_size, output_dim) + return x + def apply_rope(q, k, cos, sin): @@ -426,3 +509,149 @@ def reshape_heads(t, B, L): # residual + FFN x = self.ffn(output + x) return x + +class MLPResNetBlock_Pro_w3d(nn.Module): + """One MLP ResNet block with separate projections for self, adapter, task + RoPE, now with FiLM modulation.""" + + def __init__(self, dim: int, num_heads: int=8) -> None: + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.ffn = nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, dim), + nn.ReLU(), + ) + + # Q (from x only) + self.q_proj = nn.Linear(dim, dim) + + # Self-Attention: K, V + self.k_self = nn.Linear(dim, dim) + self.v_self = nn.Linear(dim, dim) + + # Adapter cross-attention: K, V + self.k_adapter = nn.Linear(dim, dim) + self.v_adapter = nn.Linear(dim, dim) + + # Task cross-attention: K, V + self.k_task = nn.Linear(dim, dim) + self.v_task = nn.Linear(dim, dim) + + self.k_3d = nn.Linear(dim, dim) + self.v_3d = nn.Linear(dim, dim) + + self.o_proj = nn.Linear(dim, dim) + + # gating + self.gating_factor = nn.Parameter(torch.zeros(1)) + + # RoPE + self.rope = RotaryPositionEmbedding(self.head_dim) + + # ---- FiLM ---- + # FiLM is useless; to avoid conflict with chkpt, it can be kept as is for now. 
+ self.film_gen = nn.Sequential( + nn.Linear(dim, dim * 2), # output γ and β + ) + + + def apply_film(self, x, gamma, beta): + """FiLM: per-channel modulation""" + return gamma.unsqueeze(1) * x + beta.unsqueeze(1) + + + def forward(self, x, h_a=None, h_t=None, p=None, h_3d=None): + """ + h_a: adapter tokens + h_t: task tokens + p: possible conditioning vector (for FiLM) + * x: [B, A_chunk, Dim] + * h_a: [B, L_a, Dim] + * h_t: [B, L_v, Dim] + * p: [B, 1, Dim] + * 三种:[B, n, A_chunk, dim], [B, n, L_a + p, dim], [B, n, L_v, dim] MHA 方式,加入 RoPE + * [B, n, A_chunk, dim] 的 q 和 自身的 k、h_t 的 k、h_a 的 k 分别做点积,得到三个 + * [B, n, A_chunk, A_chunk], [B, n, A_chunk, L_a + p], [B, n, A_chunk, L_v] , cat 就是 [B, n, A_chunk, A_chunk + (L_a + p) + L_v] + * 而 v 三者 cat 在一起就是 [B, n, A_chunk + (L_a + p) + L_v, dim] --> [B, n, A_chunk, dim] + """ + g = self.gating_factor + ratio_g = torch.tanh(g) + + # concat h_a and p + h_adapter = torch.cat((h_a, p),dim=1) + + + h_task = h_t + B, T, C = x.shape + K_a = h_adapter.size(1) if h_a is not None else 0 + K_t = h_task.size(1) if h_task is not None else 0 + K_3d = h_3d.size(1) if h_3d is not None else 0 + + # Q + q_1 = self.q_proj(x) + + # self tokens + k_tokens = self.k_self(x) + v_tokens = self.v_self(x) + + # adapter tokens + k_adapter = self.k_adapter(h_adapter) + v_adapter = self.v_adapter(h_adapter) + + # task tokens + k_task = self.k_task(h_task) + v_task = self.v_task(h_task) + + # 3D tokens + k_3d = self.k_3d(h_3d) + v_3d = self.v_3d(h_3d) + + + # reshape -> multi-head + def reshape_heads(t: torch.Tensor, B: int, L: int) -> torch.Tensor: + return t.view(B, L, self.num_heads, self.head_dim).transpose(1, 2) + + + q_1 = reshape_heads(q_1, B, T) + k_tokens, v_tokens = reshape_heads(k_tokens, B, T), reshape_heads(v_tokens, B, T) + k_adapter, v_adapter = reshape_heads(k_adapter, B, K_a), reshape_heads(v_adapter, B, K_a) + k_task, v_task = reshape_heads(k_task, B, K_t), reshape_heads(v_task, B, K_t) + k_3d, v_3d = reshape_heads(k_3d, B, K_3d), 
reshape_heads(v_3d, B, K_3d) + + # RoPE + cos_main, sin_main = self.rope(seq_len=T, device=x.device, dtype=x.dtype) + q_1, k_tokens = apply_rope(q_1, k_tokens, cos_main, sin_main) + cos_a, sin_a = self.rope(seq_len=K_a, device=x.device, dtype=x.dtype) + _, k_adapter = apply_rope(k_adapter, k_adapter, cos_a, sin_a) + cos_t, sin_t = self.rope(seq_len=K_t, device=x.device, dtype=x.dtype) + _, k_task = apply_rope(k_task, k_task, cos_t, sin_t) + cos3d, sin3d = self.rope(seq_len=K_3d, device=x.device, dtype=x.dtype) + _, k_3d = apply_rope(k_3d, k_3d, cos3d, sin3d) + + # attention scores + attn_scores = [torch.matmul(q_1, k_tokens.transpose(-2, -1))] + attn_scores.append(torch.matmul(q_1, k_adapter.transpose(-2, -1))) + attn_scores.append(torch.matmul(q_1, k_task.transpose(-2, -1)) * ratio_g) + attn_scores.append(torch.matmul(q_1, k_3d.transpose(-2, -1))) + attn_scores = torch.cat(attn_scores, dim=-1) / math.sqrt(self.head_dim) + attn_weights = torch.softmax(attn_scores, dim=-1) + + # combine V + v_list = [v_tokens, v_adapter, v_task, v_3d] + v_combined = torch.cat(v_list, dim=2) + + output = torch.matmul(attn_weights, v_combined) + output = output.transpose(1, 2).contiguous().view(B, T, C) + output = self.o_proj(output) + + # # ---- FiLM ---- + # gamma_beta = self.film_gen(p) # [B, 2C] + # gamma, beta = gamma_beta.chunk(2, dim=-1) # [B, C], [B, C] + # output = self.apply_film(output, gamma, beta) + + # residual + FFN + x = self.ffn(output + x) + return x \ No newline at end of file diff --git a/run.sh b/run.sh index ee70ede..6d9429f 100644 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ data_name=libero_10_no_noops export HF_HUB_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 export HF_DATASETS_OFFLINE=1 -CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \ +CUDA_VISIBLE_DEVICES=4,5 torchrun --standalone --nnodes 1 --nproc-per-node 2 vla-scripts/finetune.py \ --vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ 
--config_file_path pretrained_models/configs \ --data_root_dir data/libero \ @@ -18,7 +18,7 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --standalone --nnodes 1 --nproc-per-node 4 --image_aug True \ --num_steps_before_decay 200000 \ --max_steps 200005 \ - --save_freq 20000 \ + --save_freq 10000 \ --save_latest_checkpoint_only False \ --merge_lora_during_training True \ --batch_size 8 \ @@ -28,8 +28,10 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --standalone --nnodes 1 --nproc-per-node 4 --use_pro_version True \ --wandb_entity "my-wandb-org" \ --wandb_project "$data_name" \ - --run_id_note VLA-Adapter--$data_name--$(date +%s) \ + --use_3d True \ + --inject_layers all \ + --run_id_note VLA-Adapter--$data_name--$(date "+%Y_%m_%d_%H_%M_%S") \ # --resume True \ - # --resum_vla_path outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops----100000_chkpt \ - # --resume_step 100000 \ -# > logs/VLA-Adapter--$data_name--$current_time.log 2>&1 & \ No newline at end of file + # --resum_vla_path outputs/configs+calvin_abc_rlds+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--calvin_abc_rlds--2025_10_12_19_33_45--110000_chkpt \ + # --resume_step 110000 \ + # > experiments/logs/Train--$data_name--$(date "+%Y_%m_%d_%H_%M_%S").log 2>&1 & \ No newline at end of file diff --git a/vla-scripts/finetune.py b/vla-scripts/finetune.py index b1472ef..d1d6f88 100644 --- a/vla-scripts/finetune.py +++ b/vla-scripts/finetune.py @@ -3,7 +3,7 @@ Fine-tunes Qwen2.5-0.5B via LoRA. 
""" - +from typing import Dict, List, Optional, Tuple, Union import os import time from collections import deque @@ -127,6 +127,9 @@ class FinetuneConfig: # revision version use_pro_version: bool = True # the version number phase: str = "Training" + use_3d: bool = False + dim_3d: int = 2048 + inject_layers: Union[int, List[int], str] = 0 # fmt: on @@ -190,6 +193,7 @@ def get_run_id(cfg) -> str: run_id += "--image_aug" if cfg.run_id_note is not None: run_id += f"--{cfg.run_id_note}" + run_id += f"--use_3d_{cfg.use_3d}_dim_{cfg.dim_3d}_inject_{cfg.inject_layers}" return run_id @@ -333,7 +337,8 @@ def run_forward_pass( # Get ground-truth action labels ground_truth_actions = batch["actions"].to(device_id).to(torch.bfloat16) noise, noisy_actions, diffusion_timestep_embeddings = None, None, None - + pi3_model = kwargs.get("pi3_model", None) + img_1, img_2 = batch["pixel_values"][:, 0:3, :, :].to(device_id).to(torch.bfloat16), batch["pixel_values"][:, 6:9, :, :].to(device_id).to(torch.bfloat16) # VLA forward pass with torch.autocast("cuda", dtype=torch.bfloat16): output: CausalLMOutputWithPast = vla( @@ -349,6 +354,20 @@ def run_forward_pass( diffusion_timestep_embeddings=None, use_film=use_film, ) + if pi3_model is not None: + pi3_num_reg_token = 5 + + img_tensor = torch.stack([img_1, img_2], dim=1) # [B, 2, 3, H, W] where 2 indicates 2 views + B, N, _, H, W = img_tensor.shape + img_tensor = img_tensor.reshape((B*N, _, H, W)) + hidden = pi3_model.encoder(img_tensor, is_training=True) + if isinstance(hidden, dict): + hidden = hidden["x_norm_patchtokens"] + hidden, pos = pi3_model.decode(hidden, N, H, W) + hidden = hidden[:, pi3_num_reg_token:, :] + L_3d, dim_3d = hidden.shape[-2:] + hidden = hidden.reshape(B, -1, L_3d, dim_3d) + hidden = hidden.reshape(B, -1, dim_3d) # Get action masks needed for logging #* batch["labels"] 是 L 个(L_a+L_lang),第一个是 BOS token,这样 :, 1: 是索引第 2-L 个。 @@ -359,6 +378,8 @@ def run_forward_pass( next_actions_mask = 
get_next_actions_mask(ground_truth_token_ids) # Compute metrics for discrete action representation (next-token prediction) + + if not (use_l1_regression): loss = output.loss predicted_token_ids = output.logits[:, num_patches:-1].argmax(dim=2) @@ -424,6 +445,7 @@ def run_forward_pass( proprio=batch["proprio"] if use_proprio else None, proprio_projector=proprio_projector if use_proprio else None, phase=cfg.phase, + hidden_3d=hidden.to(torch.bfloat16) ) loss = torch.nn.L1Loss()(predicted_actions, ground_truth_actions) @@ -764,7 +786,7 @@ def _convert_path(obj): # Initialize wandb logging if distributed_state.is_main_process: - wandb.init(project=cfg.wandb_project, name=f"ft+{run_id}", mode="offline") + wandb.init(project=cfg.wandb_project, name=f"ft+{run_id}", mode="online") #TODO: set online when necessary # Print detected constants print( @@ -927,10 +949,16 @@ def rename_state_dict_keys(state_dict, replace_map): "hidden_dim": vla.module.llm_dim, "action_dim": ACTION_DIM, "use_pro_version": cfg.use_pro_version, + "use_3d": cfg.use_3d, + "dim_3d": cfg.dim_3d, + "inject_layers": cfg.inject_layers }, to_bf16=True, ) - pi3_model = load_pc_model(cfg.pi3_path).to(device_id) + pi3_model = load_pc_model(cfg.pi3_path).to(device_id).to(torch.bfloat16) + pi3_model.eval() + for name, param in pi3_model.named_parameters(): + param.requires_grad = False # Get number of vision patches NUM_PATCHES = vla.module.vision_backbone.get_num_patches() * vla.module.vision_backbone.get_num_images_in_input() @@ -1070,6 +1098,7 @@ def rename_state_dict_keys(state_dict, replace_map): compute_diffusion_l1=compute_diffusion_l1, use_pro_version=cfg.use_pro_version, cfg=cfg, + pi3_model=pi3_model ) # Normalize loss to account for gradient accumulation From 37f7e4c0565798e0d537ea30ef782455b43a7e0c Mon Sep 17 00:00:00 2001 From: ruiheng123 Date: Mon, 3 Nov 2025 09:14:52 +0800 Subject: [PATCH 4/6] feature(wrh): add pi3 injection --initial version --- .../vision/__pycache__/__init__.cpython-310.pyc | 
Bin 563 -> 0 bytes .../__pycache__/base_vision.cpython-310.pyc | Bin 10254 -> 0 bytes .../vision/__pycache__/clip_vit.cpython-310.pyc | Bin 1056 -> 0 bytes .../__pycache__/dinoclip_vit.cpython-310.pyc | Bin 5692 -> 0 bytes .../__pycache__/dinosiglip_vit.cpython-310.pyc | Bin 6196 -> 0 bytes .../__pycache__/dinov2_vit.cpython-310.pyc | Bin 888 -> 0 bytes .../vision/__pycache__/in1k_vit.cpython-310.pyc | Bin 967 -> 0 bytes .../__pycache__/siglip_vit.cpython-310.pyc | Bin 1114 -> 0 bytes .../vlas/__pycache__/__init__.cpython-310.pyc | Bin 199 -> 0 bytes .../vlas/__pycache__/openvla.cpython-310.pyc | Bin 4941 -> 0 bytes 10 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 prismatic/models/backbones/vision/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/base_vision.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/clip_vit.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/dinoclip_vit.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/dinosiglip_vit.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/dinov2_vit.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/in1k_vit.cpython-310.pyc delete mode 100644 prismatic/models/backbones/vision/__pycache__/siglip_vit.cpython-310.pyc delete mode 100644 prismatic/models/vlas/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/models/vlas/__pycache__/openvla.cpython-310.pyc diff --git a/prismatic/models/backbones/vision/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 0e29c88d679c813560eaec3548446bca571468e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 563 zcmZXRO;5rw7{|LYV2m*!#t*TBHgNJ_jBgi|pb2D$#$+k1j)rzy+JW&SdGWJ&-qn*| 
z!INzj1EEcy{Qti`Phay+#{;U}y?-pvEC4>W*=@Emmuko-6%vqG3~&os+(x#gz0GRe zL5|TiR_6`WFxp{F?jqOdI%{zcc}6!_oBPN&y2&~`K!MWk6ojoWwLb{$Dj0E`l9|An ze9i^0f{03*XV-YKn&%k_t*U!79z8~Mwu6ptX_g=2V~wXYJ;?9tiDz-7(}6l=O9S^9#r6mA%X(fI1*D#1lb1S|xV9G7$fd7cm^ n;r#C@CH%btjANS7GLBD+*UCLtS!+fv^moWWg&y#&ZSPq>_tKgt diff --git a/prismatic/models/backbones/vision/__pycache__/base_vision.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/base_vision.cpython-310.pyc deleted file mode 100644 index 797690a7110d4d04a0ab084f9bc4f00b6989d1a6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10254 zcmbta-ESP%b)T>O8ZMUry76t4g57kQ_``o{v$WtE%C{Un43%&FuMbX$v)ZaO?yIfL~ zQ^UKscklPdx#ynWIp=OXK5lFHy#5cr*!by^ru{o*Mn6SlE+HgsUDJf-YC;!As7Hot zM5b%%ls7{wD!2tsTVXLOxg|~)!g5q`E0OKm(U?0HRo!Yd?v6(_w-!yf6VaqQ$?b~a z3HJoXDTY(-6#q^~Gww{k?qoD;Xu1}ia_6|D6wXHr?gFRF;pu46U5w7SXE?1rJ^x7%^*Q}=}W=VL}P-H1i9I z9=q|oVTu`2shWO6I?Y&N0za>B?>K9(z4>N;llGeO+X)&(QaMp9WXP?Xyi@7OJ4`2{ zoBfcOx^86V;sveSutzbo4f={4eLJ!s<>oAQzljyE%!?X0LI*rVM5#|=LQQX~LOgybwD zp{JTK5X@9-7Vl|IU09-Ux4NhA>R{O7YG(PWm1M=OZTeO7=+i(LerO~M;VFdVJfaV^ zT`kad^`Gm1rSD`Nhk^?|{cvIx0va@Nr{> zIVf3K@86tcCBJki7`=d5iH@lw6%iz%&D8W+y?3VD?x3T`i(0;j)5cYJWx3zBt}7yB z-Qb0(r;%0;MMMa74pCMe9pDT~c(7Wx2U72KKIg`_MPO=gX*9&pF zMBc%N=A&>GA-RJn(Duw-?N#l)ukBeNw6<%_Yn%GMF7z!UnSLDyyRZL9ACwDo9>m`& zBo1nNMWIAxu%YZ|s`8)D|6ry@j5r?T~DS8eOUF@{60qW^nZY@=iX&w_b@wW2Q zaHuJt#vHCmr<+P|cyIcBX`rNr$kNODg0Y~_>5HbNt7Q~55Dt9^TQ4CbR}e|f)q$X3 zY-vhM79{A!h-1j2Fi}?8EX#^4(l28V;ZKn-nC04x2&JsX0%a&aGBb!;%=`IWqZ>Q; z5Hk=lzmM=s;!}b9^gv(btv)a_E0JMSy@ZN<0Ebc>F$KzAE+V|Iq5PLLux0d49iD!9 z&}OwhmZ3x2kEG{8b`2z&msLIQeZWs%qsD0tqMoWC%1j^`?s2Rd8(DEpwv$+?GS$+Y zBS+#;mR6-!5dA(vGKr|H8G21O9vKL;M(wv;`{YCT6LdK^1lJUXv_SX*K%S(S=7~ps0Xez=+ubLPGlUK8acOe%Be0!Kqz3*QKngOT4!~;>D zFp}m{7ilNu?E^IZG>2#38Soxm0tcLq7QmIP#sXChs%CevB1I`Ip#sjK*C@1fiufI#g3*7lP|UW7CA_s?qZAY4;$GNIAV*-ql~# z_Vtf|RUbk*o!3$ei1NZcy^mJqyp-D9(9*(Yk@BK=pOB`sdMp4=(-KJDw;(s)8{5-A 
z)Hku1rXB#m2)ce^ypQfo8i;zL1i;Ezekm&ixdM=1F{Y%0+FZ!k7ei=Go(hL$zi6R02huXY0vj3b2hPsSKnH~d+Uqp&a^D9iu_&Ne_cSh@7 z`C{7@P{EjFAi;ooyz>WoW)d72>RU8*0RYk1!fDV#02rmXn?j-|iby@6|8TJ3f?oK- z^dF9)uF*^-wMf%jv^YgH=WCQ%)QnfAzDI?>N0EaF7mRcl^%_OYAat!*0`j%Bkz3Sf z^qMuR*NoqBt8(U8txu?L&>(+6(H|npN}ea;hUc+}1ZuQXb%`nu3uN_8DkJi#ixhpn zEdDl{P}_u>l>vxT#y1O!N*f4=KEyhg5R$JV8pz;xGzLOTT20(iSrGa~C?O1n#i2|t z^?|SYhD@QZt;ct+`y!j__<}3~uNgp#WsO2z!O)MRAgzsr64R0pU>8?61>0_d;SJ*l z#-6cfLLoACtzBadm*F(ZIcE-XhOt-JD?-KL(6shSskv!El$Ibu3w&uT$PTM_^Si-q zNNV4?{@NSMi~ulhd;!J9NdSUMI_o=5zHl^3b|TmxWE`!>O2C5gRosPfMP}263|brM z%61^qjpc`Q`Lc2YCK@!~6q3cFZD-X15lC^Z7Xj!{2QxGB@2FMkVXoc0mYHazt|MD7 z=iEhP%Jl6_-*Ag)>TPehH7Hd=l7T}dJTb( zf7W~7;9{8%9u42l|9CE z1RV59{sqy1=_-1~5DTPl2@3&Q7(m|=B~fnLEDs6P$K8S`$szz{OjH5!PzsQTM?j2= z+FfYFu8oojQU_2@N`WON#R;w%mo;uZ0UfW;+UPeWrl}S^C;RmlN@v8$e(8yk(phn; zUpmz<9ow8n>p3ysFQ2&wVeFm+)1A(0xk)*&tRMaxHmfcm%BFJr#TytjULdlQj7J{q zB=Cwyj)YXUcCx9y-Re7n9g2hkjO{|wh5pWR35K4K?KEfxD7b}c7jh=|sP!$1hW!8!6{xG^GXaY^RL6q$5>h`zV5QzJ6`{QF*R=KCsl&7# zsx610QAcNzZ&zs$2N9$K-I8hle1`jtQIX#REPB%DJ$+=s$8>pY?X-ScyLy=-I*Xh-!7$}eiq|kl z12z-UVGl%yByjGp=%NV&x~2D?KV}if^tna@0D&qT(&k?TpnLgxN2ypnF!Ccgj|Co)gITG+EV8k45}3W&BP8fI`Es;Bd`EGG z;}HJ2xV=Y<`Z}V=DF37$Uuqo{8Ldk}miJ!_C!Ra9-(iz4HPcwT8v%mYX8&L~*TRvx zMjCyo*=9EUu<3P_Y_JE_6X{Mzel(o+nIqF4*BZJfiIV(w&mKiQR;~!~a@C7WS+z+K z$xcOjy4s@XK>unZo0Y-Ckc;U~4w-c2RfnSYskXq&REpB$efidqP>T98&Y_kqis%BX z_YhG?$VP$vTrhO&Gs7@`YnaAw49oa6N`XW*WjJkt(QRlH5 z80OGtqMu8Cfy5~2R!qQd4!N+l0KFx^ZW*BhR|xb!VGFZaV+bbD9fvh;%hCXHU4q++ zv^U3O6$AT|GFX~2s*9GkMOZuSe0&;o85x{jv;dbJPC;Q;Q zR~4a7HkrcJO3y)U{KN4~VDX93Z2Hd} z`_$h?+WvTq;5)6RbRWJ#D)UaHQ}mlo?;FWYcnZtpDa80Q zAD^W1PsraeD=J8HXwph%g?&+@Hk+r!X|hN+XMe7frWNFF;F6$ujoVuC=c1OU)Zb&C zQaXoOKEf;$JPS%T=h?u=+e-2?as#Au=>mFG1~r?f#pHdX1yvm_PY8G}=e56s7Pd#K z`=XcvNdN0_J85{jhX(F42F@inJweT@Kg`y z|DVD9f@n93{Xas{GVf$K}EZ#hBJ*@?T(cH9kx69!v&E`pWR?R4PZ zMYp!xc5denS4v`m39y(So0c5dJ`%n&Op$b@gV&*80M@JvDCnyY;Hk`?rJM_il+LX$ z>gB&xKlhOC9u(0wyar5$Y%EATrWCY?YgsV~T4B&}t)Tt#R`1z&+VW0EHlQ&$w+`d( 
zP+Pl2j_YxL+Je@^+hM5aRdap(#kLRar8m~kH6Zu1N;t54Gb@sQyZ16v^cFo>d5vc4 zmJ@*xq|UH{1FEa&;AM__oI8Yyq+*fIegh$yL&T1p@*ccpyKn-|fU$CCPF;Wa2-Q$h z39`UBm+_P7caEq(>MDx=1A$eaIjy&JL_iLe=HpyeeTTC1Hrpf44UmfL5$9HK`qqGl z`b;6kIi9+HPBk!Hj-lI>BE>8B`IOh2cq&X^{<=z0mRTf8 z0l%#%B8Yp6m1TN&jHC#t2g~M!oj6%>ogVNKMUc!M9yQ5>&^fsCII%dq-JiqD&n9j+ zgH~1^9F#lRU&oLg)C`Sa5A%sSNrSUib&FDL5uBmaTZl6ICcXWzK3ca&>>MTncNku7 zripv%34EB1;q8QWOP5;_C*&G}L#cBVT|)F@goN~pVg(L%{uX~-EEe(ZVdKT4s@sn& z%i?lN$7{%EcF8C#>g7i@QYs!z7H9A>GGk7`D`4m|hViHhHN$#jm#GA4xQp^K`;lF; zQ8EPuW6oSKr_A2mlbGlbFVh7bT3-0~0zfi_?*;&+-#3V(Q@|o}u)64NF{MW;qs2~6 z_Sd}oHoti~zLT&FajxBX^}3TAG>|3ah=)r*airhaz?1Wu!#u!#dpK#@EN(n9QlKM#|-tp$zkp3@WTmVjP`9h8O z7|O~gZ2by&>jRtuYVk{L_VM;Yue)`#_XC)ABd;O@Lv4j0<_;c3a$66G0DH$7I5b9$ zIgg&RbCd4Cnq%q8Zw#Q_Iu@wt=JQ*fiYVm_BW8QL-2aV`kosjCwmu6swI20O9i`Ee z>^YLIBoKLkoEp4OykP$siXlw@5U{~PK`W7x*y=W_Tmv&3QZ@6eUd(U}4- zf_$H}2y#N?P8K64KnE4aE`lm_^yfvCjo$pxjX&A28*ZIAH=lKrj@+EVLFcnuJi9x_ zCY{Gau@RFW&2dB_69<1R%GU$`#_6N$`TrPy%ckjRTV}Upy^Tk=JE}hY!rW+ous> z^tr)bYjR0T?3Q0m(hy@e<%zRCX)VxyanW#rr)YITlUl`rPF^9op4O9&Dz mZPKm+tS{(gqm17vgh>GKB;18BShJPM+GNQ%iIVEk?3SBN!O^8;c{Ze7~G!&-;PZ5N$(5Q>ycMAlC) z+BX1#T&$OmL=+v1qa7NYo+L3Z9>N83LKP<|&4TgMEmA&PU*GH}DUYcr9+?{Fr80P5~UD3QcKzLJ6nrq@J7fv4n36r89` z(u|KTbeHiV&BKgT$L3JiCC_ph4+3MV!&97DQ~Ur;t*KpApKXS} zVA*9>3~ZSFu?D6hnARB-jhSY;eU^*hl#F>n^y|PIClU8#F2+0_ozr;aO+xA&?{D|E8BH@Ty;KSn(M$whlrSDDZ!V|uDt*0$ zpP^iWCaVzYE~GlPun7yV;`Z<2{y#4D=WJXojjnFP5R7TWfnb{u=$D74ULxcyr{V0x z2G_JL;;d|@J2O534?{v^7l5j?Vyb&48M92GD)%1aa%bSybb}C<1cdm#TblifX8)FE tzoH2n@E@vbX&Kng9DAW@+1~Wh&1cp8&#~!jw2S`k&Q1e*yBLBd7oX diff --git a/prismatic/models/backbones/vision/__pycache__/dinoclip_vit.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/dinoclip_vit.cpython-310.pyc deleted file mode 100644 index 4c628baa3927468f57d7d23ab79a43d11d02d73a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5692 zcmZ`-O>7&-72ZGoilQirk|kNT*G-y+X-vh58=$T0CjLoa$B+%n@nL~twc-q=rT2%K 
zUCK5=Un&>rp)QhhTfixD@g?Z3=bqa`&vPo!UVQGYhq&LHB`HdlN$l**n>RCW-n@D5 zdmGQrmNooteD}}Ytu;;iH#Mez78)PnN&bO?X^zHpW(0a@IEGGjGcZHTu~gj(a$(-d zhXtn)7M-G^<${t^0$(mDJ7v7{!Aw|jDypv#%!XB`8qPU$s$C3f;k+{+)}4B|;4COw zDQJX?&SJRaEU9)mI1w&8%i)T%60SO{;YsHt=rgRsW?A)*hOawo-1t;ubFB76W3_!_ zUw2NSJ6~V{O|7{w*eWwWirazTbNBqT)q7Mf-}4hcj_hr(eSbTSxSj5L zsV#Wg7g1ujm7CXZx5_V|Gp(u1 z67y289e7ECMxp14)c1m>DT`OUAn>*WF0HG6JC(V+{a(N~z?8cldON&n$lT>1Zr_*l znP7K?7bTrogj`_wf*Z!HA8_}9@Oo}94*d2bS$1PFuCJvB{I1te<7rMzUbqs6y*S}A ze}^ajfHy5!RSQqnk(HY~O}W^PAKvzutY%_d9;ruG`cV&cHzm0Tjj=k2~^8MB^ho$qf`-b97j%@wvuLZZZ90_BJkR%ye=*51NH~ju%*-7g>Rqc#)U8 zGaX|^>*QIHm7Z8mg_U6^Gji!FR;3d?q8qhe*Qb+mCz_TGCH4{OU0LxHwI47Px2Ouq1M%gdPn!Qq4BN$dmU|ky3O=I8j#l5 z5DVaL8sY?AVwviuv=Sb4#41rvqIj-}(|G-FbA2}s`MT))yFA)?;6*#@dx5up@8-wn zK4xAI=e|A>c^$UKgJgYtHInr`bywEMhd#PPvgo>Pn6>M^fh7{!(_8xB%<(F0jd?ek zvotXBLhib-;jv}Avf{d5^t~YK5p$SM&bh7^MR5vE#!=$BVuh&Ib{q$yKvgqIg&^(8 zTpEk^uFT)%Q4$MbgQm7VRh%K(5{grJ67q&+t!NbWivFgd=&x8>2|V;m{(@p`X>5rZ z++-#m3)Yom`A+dmt)sI7D?XWZa`p9)=E;vK-#H ztWPb_i>Mc$mA=t1I^)1~KEzQ5M|s3ClU9hoJDXP3du|_xvaIdutoRh+foN_m&A7@V zu5$NPt}?iC-T7yA8u1OrxeKg>dV!vfiE@FJFOozVPg(Iaj_@?bJl#ch3YQt9zSSv! z`8r+e$F%49NnhhoAL(F;&E7Y@(uJO$7+UG_zL9))Lu1u_{ntdvDav0#f&7YMbyrw! zMN3z~`L`Qd#x+IzJJDzd^mL5J`E)s59p;Claov-w_SD$XRePRkzri{ES>M+m7e;)! 
zY_16(wWs-t>DsU`Ebbf7*D0oL(vFu{UF~5Gebz8{q;Em>`Q!b4r6*^wpT_hKL2H7x zNHn&D85j06=;SA=-k^H&Q}mxuV@|VWw(>-2nk}tpSG9eTzm%S4t0Q`2ie66Fu(LD6 z(!QSUns%!wWktdMh;krvYO|BHr$1 zAH4H<#2@x}o6yPD_;!+aJ{qF7&K}QuT;SR|3rS}E2(bVFLwVd6w3ETiXzjL$d$JUa zQNrNO*Ge%7ky!s&F17v?m$X9eMT0lS?h3mCj&j?boh>``lLR`jw}@ryY@1{n|44u2 z{WtOamm=T}4*yrh$9jqg%CrUy;A`UxM9b&;iOL3X0enY_m6^G;+%J)?i`F_7fYy(N6lkUMye?XVomtvoZ$~7ZhhR z8RZHgRsuHez3ZFTZ*91jKfdzWai<%mD&kOqenh>#Q&Im#n;>T3bDoR7n z(xIC=S>}L#_*Tf7l%RCx4xDOsN@JAoR1Y&F^P=nD@q@rAWhzG%=3WBsqI3ktJmVuUmR&J;djU9uTPYE7dgRMI%o3lMT@k5-G{mJIl+jf>bW$7MeQ;E&J z_O1$<`!W0!MD60{KqMD{MA2~;P(saVNJUz7%c9i}d@gQ=U%%+pDo*LeR z_Z%@k!1M{Bw5n0nYx>~b;|}9>gJq@XKjeXnYu)F`p&W*eX>wF8G=|zU^1y(5-{_A4 z2%aH>A>fA=fM3@nP)J9`EOl79LFY~%c{i#0V3W*N7j9iGYa<;8uqPf7_jh))W6MzN zB#VD{l)+QHIF!Sye3U1xgD@@5Lrfft$^5WsiR);Iqj3TYz!2UT6;Z4i(l()}7u z<5biY8?}yTU)%!!V>}Z@*Nwr+;}kveY=?}Vq8mjWW%Ohr037)GvVsst8Q%p^fH>~p z^<2-yI~+aa^4QBvzju7FK5~2{AM#VJYjpJ!kn*{{0Rtv= z$`TK^IYas%W}4+79#z&ZXurjyB&!*NQ^zT4GSBNp`2pt0MCroZx@K4J8uyK)GQzwN zu?w#tcEC0|Z8@XP&sCFW0BwAr$_|c`wu|AvBWcOlmyPmNx;yR82<(mKdPQP7V0E~3 z4O7T9mdeO1uejX_VlO@ap*ThW+1T}h4!$n%wo1uemOkq7uSp^@n!GW1`#Aj_Hwaer z4u&)v;w)Z*^e5h+f<8frOH^E=f_5rADsoiN?GO|@l$pb+0TgnPJ`%JY(V>ElRFI}* zPKhT1s?UxBQJd2jgDEp`h+T!HCTcW)LP)Nt*?RTX>AX=`Mfj*?kF}}`;%c6e1=nS9 z8=nwnD8!HauOKbRQl}raQ+y*xGN6%%lHd`tt*BinVLt`$=(gX!elz=?aGKyvEBgS_ z8l}hw4oPHAx7Eze$6RHtTQ5u?3@Fecb;>9dnT_Ph? 
g-|n9&@BqBR3P8sifPzkF}RgV>dg|Oz-!bN9M_lrS2TymDe zWoJ2DaaJ_06g0wBXEj`N)^xudtcM%UM!4y0hFi{7c+NQ&Zado;tMDpc_=4eG)9=ME zm~&pR9~!*Qmp(W6(h)l{oeStM^A+@0#MY7JT;#cXMyoOAW$q86*zX1Y&^`2%_V97J z{J@X>XwcmEIuG}wfoLWNUec5z8OcH1>_mf(mxuu#-s}nwSP?h7G76jfQF73{d29Eh z!>dhiz?=8{-kn=_+vVqwq0ur`iF=9H3A{K)r!e$n;(I~MQpFFvAn^7Bq3oM}CsDck zqhTO+L6o}{dOgu%Dt9A@IuF%S%G!PD4dQMjLm@GJ#SJ4q3WWQa^oH&*3jEGvRdype zZEq*X^sYBbqIpUzUicsihfyq4{$mmQW6{c~n*Q+YJF0p|B#Dsw(WAQ_SGAOl8<~LA zMh6Rw65+<;(~%f-ggf+-&Vh(kbu<`)%1vlT<2N8-H}GUHBY5YUD@E^`OFB5b+71r+ zJJ+s;>O7If(X}gA!gL|fWh%OQ_1ZJ0Y?`3*Dee!Ef9Jc`T5P<2tmDgA`f?my#yZ1C zXANM+@KJ{pHvcQ4^L;$=Z2)07CR~DjVhBsv+`OK?*>!_kPEO>3v(e6p0?&&gFNl&T zigLfwWt&Df&x^eDx$RVW83tHU8#l3))KE@|yg^*MJ{f>JQ=nSQ1aS&~Use6s9ZEkA zy~OV*%MW`h|L`-&I#&7Ab~_BrPk$B~IXpkY6Tb;CG5W^D?3%tYVZStgYNBt>_qq86 z1IKJvu7bM7BbSZWGqtkD@>psA7g z7FRM$jBjvDSlq^wgQbzR6?m~*{)y2wd5M=lUvP4;xqO#GVilHF;0v%nOO*O$vOuy& zv}}aSMT}NN6_^EK)4yxMLYWSnHiBkNkJ-Rb=p@YI4AbDN8dKMpQhy0lYkXbfmVaW{ zhO+`|*;K2?YI)$_pGJ)5zs0Ven*eG#RkJ(QyX$kcnp!skE25DLK_l4q9;;<8y51;A zGN+(HwVaa90#6_Y@*>vV?T~J!zbYDeJY@CpyMT#tr0sjcCe{Q#{1mAyVG|Sbfs7`G zYh^u11U);+_4Duu$eri#hG##ufiI$6{Ic|wf!QeqeE$SR85HG=qLNgJzQ2&v^n3Bh zq$yg&Zx@$Z&H;(mFq8kxlA*@!aR3{m(VWI)3pFsc=d*61%;^3Y&!uF0Qk;~I z803GR8}~?p=lF`=!*?;3o8(W8HT2luPxtpTGyV(gXLWvuzf%3u__ zy4*n_P((sP5+94_6t7X*n1ko!jjHSqP&()#RY4BZv;9Hu!|@xR48)_M=un}xchdD| z`Ec(NmRj%!NF^wIbg-6fEIY2I!Q=A4^AE-Nt+SRoE5Ght8ozxO?b%ru)RpWz9T^Q( zDVSDCS$yutX5<5&uTpq6wy-QHd^^y3%;Ywi)z-lYzy zV)`RtQy{#D=YKW;dN7y2Dn2n2BztZ&U^0KlK1E`GWW!k9OS~?|<;48z1fd;GSAdRp*DgB0ZqudL#p9eX6Td z1IkW-QPqyb0YOlqAp-3;YEdtgo*Uo|QrY6r3)E7!(D8s=q5WwWS?>wtJhJgnaGw?o zyvIVu%0jkN`F%PX#HxS;0qQ-D>KLY>Her}iaF~7EP0b}5bp4)M&^$~H$XU$3Jw>Q` z#`eqt;iRyzIPp;0%LVYI%CoUp)`-zRBXETPl{sqhRDFsfiB15rPJ`tKUN90rkTR03 zs%+Av9Rhm#EV`(*sn4gfpPpfyjVXRA{OO3QBy_gOf=TVvwE8#5OQ|mWAaE*E`J)3L zD4}Mga~kSCZF$=0Zh=P%d-2}_+S*bkx7=7;xg-fgC*&h(^A$I!Sbyv{)r+%&a$>ho}dQ$Tu1j=CTfO;-#O zUkW!nN0UFqZ3OgMcA(9Y2QaHMdlNFXekh2J(MA1bU&O7vb{0v+wJZ^!qEFJvO;R!4 
zvb3eCMLJy0pOiP|m)6W?=jKyEM{SU4bz?r%(}xJT8R0)*`IySf8mpOgbKE{#9j}_M zszd*g2wcR0k%&*6(rD3{vJTKeKF0OSLna#&<4c^+s9OEXe1aO}OH{8M)hc?{fGP*7 zeHgdDU{CDCf)Y`$_KhR!Np6zUHExcm8I=443h(Er5{8;{sBB5mg>m=e5w1H(O~(S_ zP3_k^n&BpHL2%$98X1jx2dT=_a(;Gs@^*G{LJ8``)k$_k(AMetxRTJPOZhIg3gued z5206GYwQ9g?-E!h@D_lwa1(>^CL#KSsA`YSmUe2HQ%^4=D#y_k;WERX(z3jdb$*T~ zrq7hMLT$@7^Dk;P`P=yX*~-m=?+I6(BF515uI{}QifX1a-! zX->@|T^dbwL?6+C^q(nnQ}s#9?YVeRCarJ~MaWI3#;j65+n! zxT^_MH4K3~ExZ`Gf5oGDtF!UNvxPNV>DA2qDOO3D>BG4DR^RNihb*pUMLn3*c)I~6 zr37&+m$F&WdUf?D(h_v4Xl*>&md;{*gf)ImtdqZOAl}a3D9lGys%o<(zrj9fTx**g zHvyAV=2&X5#rXQJ#(8I$NA)O+%ed zw`#8w4n+N!qgX2~e%5S(V?W7C8e#ENWXJlgE;FB|jxVik2iY zmD60xkSrOl3@+nG7z*eCMW2vEkM6dzv!C>kt>boMnvDr6S>KW4Mv1M@Y@*M3*) zUC6WA7Sdcjee5k$&u%qU!F74m!6iaCJ+F&)yGhXAe)}>0?h*3KCF>So@)F%V z#lT5IIOU$AnU{E!aGwW~CjJ}3YrK9=cwKmBeiHD;5sAY2W|PajnC-w!mfgu|^9oy# zh}z9#nPmfcywB24Cq*vit*WkrbsaT-j1N#uj|5CunU1%2L5tBYR7G7i|kdN*VRw# zWVdG0$RdZ66qPl@qk#B2eIuY8205V6Jz}`RT^$$_w zLGP${*oXbSSMT-@`>&6pzy?OBp|0VJ+6`N_xTJOiAXUs5fJ^fGLp&}r5$mZOi+uEj z<)e6}SUh;Y*V*H2QVJbUv@{tjWg2G%7s|xfN|?BkCjRF$Yj>gA=%$6CPCe?}qs>3_ zhyO|3z2<=oohVVmQOj8-kXjP}Cr*`Hw*Wp*nOdFrD7p1zUM`2(_KHy|lmgJ3*r*B) zhQ95BQ`+G%Zc0d=(B)1)s@VnrFH#)QxgpxEh<0y?b}OQ|VFcc0cONwVEkxgP!S!An Zx*YF@+VFXW{{_X&gct(S3c}#7{}-ur>P7$n diff --git a/prismatic/models/backbones/vision/__pycache__/in1k_vit.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/in1k_vit.cpython-310.pyc deleted file mode 100644 index e3c5e78cc9310eb48d72e42ab8ab9e27e6c6e060..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 967 zcmZ8g&2Q5%6t|P~V`0_M#3Yb7^td3}&mjX&Edf0HH3N! 
ziJ=fN%&Zh=R%l_wY~~0Y+V2o+u;vwFO>SM0J)dWpTL8pR3@CvXc2r(_;P=5W@-)J+H* zLOq65qD%A%FK}rs@FTRa7B<6Q9XK=u%&fq}vITE>wr;3u#-(l(umGGQde4w1lhMiO zI3P!dZ^lQ*!P^th(T?J2EF18dG*ESe8?3hoiBhT*Avfsvu|F*`?#oI{d7gZsdE%d^ z)SrAf>>V;XE4lP%QmBlUBJ#6>@l^S1AC$i|#^3mFr1?MZL#S;?O>AKk7Iy#4AO2@< zf2{zt89Tg|5tY-7gD95}K&w(yZxiylqUmbK2D7y-^0J1~ot2u1rzs(F2O2F?$5e)i zG#;CEWl-8^AJ;2^*U&9OSP=n4?@nZYDYAbjvcD7w3yQ<#FqcBY)kE^0Nv8J(Xbo>b Xb^GO#{{e=XGOjpiyXiR2-v^C<7bOG} diff --git a/prismatic/models/backbones/vision/__pycache__/siglip_vit.cpython-310.pyc b/prismatic/models/backbones/vision/__pycache__/siglip_vit.cpython-310.pyc deleted file mode 100644 index 4bbd2cee25bfc89a0c02ec867107b58887beabfa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1114 zcmaJ=&2G~`5Z<+&G;tG^R;?=j?)Z>6EosC7AzFm0K$U`0_2A3O^=^^{|FP>78s*kX zJ#pg73-rRXaNZLVr@jJkU^aHENQJI7Y;{}NxRjG3gGrU@GWlm+9^#|^DG(w(>U7bG$5Pw(@pDE=O0!;0PFG`LY~NE{f4PA zyBlb7=2x{i$QEoE@-TG40GDZr^d}&RE(`yf-ox} zQpYP}18Xs`{bzFT2F1;*I-o+`KxaXh7W)x{(riK?;yBd%8X+IXepq-+pj??e&St?> zy{IT)VMvHr1)yB%G{l-#6;R6{Qd3_+kMPXWb4qG~5Sj!KqH$elGcUAxU1&2e6b$gs p<$Gah*gINj?75(`@H(ifJk9fehT$Z2G8U>?hNbDyPx<$1^)Ha+H!=VK diff --git a/prismatic/models/vlas/__pycache__/__init__.cpython-310.pyc b/prismatic/models/vlas/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 66e0a147b34745ff54596119417684a399d2a3e3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 199 zcmd1j<>g`k0)s7o_He`8fJ%GTma&2eQj@5{p23S27f_04Xr>%UM4oKQ~pss5CPpH7~t9F)v*o zs7@cKPuDRev7jWiNWY*cvp6@gBr{n*H$Npcrx>K5SU)~KGcU6wK3=b&@)m~;P^L5| N)edBJF~|-91_0oOF!cZc diff --git a/prismatic/models/vlas/__pycache__/openvla.cpython-310.pyc b/prismatic/models/vlas/__pycache__/openvla.cpython-310.pyc deleted file mode 100644 index 663c9b5d9a2441582975da9d97a12656cf834550..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4941 zcmZ`-TaO&Y74F;Idgrn`yI$|E9o%uic#u8jqC~M^#;Yy6RM&>vwvR 
zTFusQz4_07_dY+VY5!9XiuBhQ&@P zyO%p<+^w(@*`1~AT_LPSwN5P?7sGng=rpo%DO`?LIxE?@95$oX&T2NUglo}>&Iw(k zt91QH@0|RlMr^WlUn5I&<;d)85c55)RXzNNo%CtE7kcgffo;EYa3f)U&v`2$gOECe zc7r&Gx1F~!=hn4X9gjPn69(Hou}@X+>@%<5r_AwKGKdLYzQY0@c_Q#{U3=?F&VX{q zA8?UGPMB;5{$#FA0`4;^f9X)fg0j{2Lq~gbeP;AIyI?vtQBZ+Q|lVX8!>2! zn8fN9|DbhsQXs|q)lQjKM)p{%SY(xy=n}2c0^BwX@Yr zqSGYO)9e4_JKN*C%>0a4(>#uxOz1v*?6&r0wXb8EC2X zcl}SguBCdaxkhTBH;MM1_E5hL(--&vz9!4AtIXrNtcJy`j;>XZ7N=o%PLYl1P3*gg#*65hrJ`=@hszVSw5Laug+RHZTnb_?*dTLiX%FH|YrFJ-oxoyW zX=8e=VNga}LgVYu*C*PJHqyuHZtR$bF)R#=pJ*d%tPM+PX{Ufh0ZomHW9?Ion4iIa z9vIMIX;i|Da#~Ew%9BXxfw@y2Rlvm_>%)pzN{ga;WUyPJHmau;QYPh(4N_4)N$g!? ztn-_qw$n%}=$A2Ph1lErSRXaVNFk*9z!+81isIEZ;8A!UR^mp+broDcj4NV|)WwOk zc%*--eP$f#zt_`3svqe?8?$V1UBSG2sZAQ2+U#6*yp_FO$Gbmfe!NUp?w3=0baJe{ zuBp`-=O#GYbIzG=Hik>!x=B{kB|LADwFl;(w{%UM5~n}Wj*OijkW*vKH!2g1o*((#b5g~xY6oVlhW630(s5yXRJfLxz*J5T7vX-xB0 z+n#OuO5bB%l=Hjnyz|C2CriExCVtb)0OA5NIuX4K&I=!XYt?+)%g&9*4}v2B7am_T z51h-uq&)XK#Df@AII$N&1&|y`hXr&`;mRPss6tI;a`mY46o4b}`4uPVI@gn!UU2*# z^>;Z?#}iH#>$6>_%3ZkcLpH&@Ag07g;_v_+P>+MT!pr<57c$*WP66~2y@b<+y%#l@ zCrIaJmJu?*A``Apw2K$7Iy5F1X}6mbkeVQUdv;`5=nWz-K78tZCCAgg9mr^sAo@a$56|3;k*E6!6Hw9Zso${MjyUL3=cmp?<9d6B2Uwt) zXex)UqGfmh9MC?heP|5zp#dZ?2=eSm%T8&GPRzOjlF=wHtrqI-@OAzmaE%TxLItw z@X(ke6e3v-!)QLGGBby4s@Z)8eF*o?AP9*#b{+MBwCeMQ26S*N? 
zP>btRb-1~DYEwyxlB2UV@o6Y%B6Mc{>|Jx37H2PcXJs;LUed82tJ5q^=slGD9eeTs z2a%1BPxwDm31($DF|N5=1L6g5qwY*zIq zYN!oejo0+5Vdw_tT3;Jh`K#04Y%HzmHda5OTcEF_SzlYl!}afuUtfjy^ipG0V!ejO zZ=g$2mJeVGfkbnp6MfgC99|O0urA zbKreA{va8!97vqwXaOQX8aNXJH5SmKA5s9A(?@j*EV(@gx!X<`0h;CQZW2P{3d52N z_kb#Z>RzIdMdkH5vN;2e#H(=dG4Puh>#}qB-VLlcw$DK_P~5;Q;7$yL2;!VAa5d>V z-kukPDv>Shb7i`{I^$&r^|=|+SX!uRk;7tms{`(lPmv&A<^)ap`!xR1x&5RsC89CHFu;; zXnuL&^&mp3LS6+H3!r;9Q$s%q1OMRZSw+PAo=@Adn5&fi1RJzW=Hr&j>SFZJ$fN!{ zum!+3tSuA&~d15?3ygf=-( z-8kHs>UrTjS<++6is4E16{ZR<*Oj*GqHMzd1sGRd_wK+8^EXTdo-|R)F;k5S-7xV4 zQ(rdO*AshHQR}lJRk5K=eI82ljcA)GMr znu#K7laD>0h9SNa=h9}0O64e~zSI?+D_kpUhJMa8kZO#tjAH%E;*w!lkKllhkc1u; zCwI$O{>J##hQ85oirMOx z4^uubt24}|jAw}2d2W%^&Obxjch&zF?Fo)&)tQ@-+%8}-gVxTO*|+O-(Owww_Kcdt z;_B?VJ^PD;sXw#i6Z1FfkD*DzpOp@|0aZN$+?)P6lIdeX(eoHDzm$8&RaG5wMYpt# QQrRf$)%<6uJ5s*+e*>0?l>h($ From af9ba8293a306ef404be51e38876449a8a1e251f Mon Sep 17 00:00:00 2001 From: ruiheng123 Date: Mon, 3 Nov 2025 09:18:47 +0800 Subject: [PATCH 5/6] feature(wrh): add pi3 injection --initial version --- .../llm/__pycache__/__init__.cpython-310.pyc | Bin 353 -> 0 bytes .../llm/__pycache__/base_llm.cpython-310.pyc | Bin 7587 -> 0 bytes .../llm/__pycache__/llama2.cpython-310.pyc | Bin 3153 -> 0 bytes .../llm/__pycache__/mistral.cpython-310.pyc | Bin 2491 -> 0 bytes .../llm/__pycache__/phi.cpython-310.pyc | Bin 2265 -> 0 bytes .../llm/__pycache__/qwen25.cpython-310.pyc | Bin 3044 -> 0 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 582 -> 0 bytes .../__pycache__/base_prompter.cpython-310.pyc | Bin 2905 -> 0 bytes .../llama2_chat_prompter.cpython-310.pyc | Bin 2916 -> 0 bytes .../mistral_instruct_prompter.cpython-310.pyc | Bin 2353 -> 0 bytes .../__pycache__/phi_prompter.cpython-310.pyc | Bin 2303 -> 0 bytes .../__pycache__/qwen_prompter.cpython-310.pyc | Bin 2567 -> 0 bytes .../vicuna_v15_prompter.cpython-310.pyc | Bin 2622 -> 0 bytes .../vlms/__pycache__/__init__.cpython-310.pyc | Bin 206 -> 0 bytes 
.../vlms/__pycache__/base_vlm.cpython-310.pyc | Bin 4698 -> 0 bytes .../vlms/__pycache__/prismatic.cpython-310.pyc | Bin 15520 -> 0 bytes 16 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 prismatic/models/backbones/llm/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/__pycache__/base_llm.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/__pycache__/llama2.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/__pycache__/mistral.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/__pycache__/phi.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/__pycache__/qwen25.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/base_prompter.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/llama2_chat_prompter.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/mistral_instruct_prompter.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/phi_prompter.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/qwen_prompter.cpython-310.pyc delete mode 100644 prismatic/models/backbones/llm/prompting/__pycache__/vicuna_v15_prompter.cpython-310.pyc delete mode 100644 prismatic/models/vlms/__pycache__/__init__.cpython-310.pyc delete mode 100644 prismatic/models/vlms/__pycache__/base_vlm.cpython-310.pyc delete mode 100644 prismatic/models/vlms/__pycache__/prismatic.cpython-310.pyc diff --git a/prismatic/models/backbones/llm/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 328b2ce43dd80d11c1ab32db9b20b917acee6762..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 353 
zcmY+9y-ve06ou{l11X}sMFy}`F(O16k%N$s7_vl8V>K4nNo1!W-h$U);aQlocH$M7 zxK0_`EBV~-9Qo(Eo6U|1*8SIa^MQTq%zk-jZgJTwl$z8Oh^7Uljy>?SU-%RI5a_T7 zCk`OeaS|g{Orl*gM+Djg?q(~}Z zWeK2-n{bQ(gcb|qI}_{OB_?pr?;B-8q)%dwuubyIS<`9B%tke`UB8KT&0ayUXTH2x z)}kMzVZBjSi$SfJ?rI5^RsX|VhVQbnRIM7y^0I#)lWW|v0eR!TY7>-{q%@^x^bcbb)T=@ot^!_HwX{_H6Updn~)2OjN&9tWPuQ2(SaaFV64=t&T-ilyN9cYMEF@10uK)?pM2w;9__-;E-h@s<~Ou@?q**s@*Qcf}F6H{5<43VtMb6nQ7S z26AxJchS=g8Tai&w|RUR1{k=z`*!W7{m6^kcB}9Ewi^Ut?9%rl`k-WDb$Q$lS#;Cx z$DZ%Srx+SE<(Qk8z-%dj>w5z^(e}e5uUWUO_qgr)QJBie?YPbDBOY+!HSM={!1(S+ zigYP$F8EEmC&CktdBG7DgT=XhKgRcCE?RDr+pSRG^KRN7oZ>sy7MA)pt!gI}+ipK{ z{oS|i4PHO0-?Vpk-L5-9uhs4QK`MmnW7*J$dz?xJW%j#0smQqP##q;>eFWVF_{iI5 z3iG->pLcl>M;5K;Lgrf!?%NyCSpTSdK_? zB70Jt&_3q8Cr6VkZK&KQ1<5a9Y!j;|8?goVu>&XEKec`Ds2xAzRHl0rhZqkPdsrIw znS>8KXcQVuch%$ID#AYYlXxdaj5)*RSOga~lG(~o=o_&pql2W}Ljr}99AFpIl=&u3hKFh4eS?t~{?JaXe^ zJIQ-T0S+Ac%_iw26sJwn$mDM+YeY{}Md1p3meSCe%5|nOy=DAdX~Ac)+>>&{U>eW0 zG?r(DCq^UB3uv2Yo2>Z6fDb8x@?B=Jl5APLP}HCQV>eitmw35TS%$}vwJK`WF>MO9 zDfCY-E1*wHI_hj@JYp8L8Bpf3_k8wV$le;CZ7s4zcIinDQq4iCGVz17|14PS>H<5SMIE0&K34bMqkV5SNLd-&DOKN zSNHnR`;5eMYUK3t; z{G&QOI-bZz5i5_CPu0&9RT-*7#nG68QkSK{lzruyx(BPzMg1PE(sUd$Aje58kKu%Q zEe}*3oReZavRxb_BjSEbEa3Cc)THP*(nUJXGv#-?o9(d6H$~rT^Wf+a9Q)>p?{5BN zck9*`b9?ZVn?2!0U7XS8rgZnw=4esTCK#bLzE8w5ID8jRWTLRtyqec7bud4nmikEI zCd_vHqdGm?c%nRtq0+%x#U)g0s#wMAnJU)s685-QM=hxw@z@zJwGL85>ddQyqz z8}&lgzcMhDlq?yitgeD|p6L>Ot`)JK^dCJqvdo9Piow4k{)RfZexCnD6J98axK4a& z{Vx_{jR)lToFjoN47=|%+q`+)3vt0gq(7m#v>Aiiq$J-sCnGN~|d zx^8rwlzU+$p@Hn3>cOKq$Nbbeas57zl8T%eIlOzwF*^^R%>5`S_F)}O7m_72p>&7N z_`6XupOM-ggZVlUumV0+N)L*|yk^`iVFjZf)yHN+dlf4`Q)}wr@ZElZ3m%UBkt>)D z(IQ+)xP5SC($$3ZXSZ)mfF_RYjZ{2*1aGWVei!5MFYNg zZZC}Qk1lM^?g+^Cl_)foUWTzt-9 zdhPP%B|dfYw%ci_YOK9AlT%Wz{-hHsehLQx_L zvdCd~)DGf|)M@Kt1=J+pOwm+nbixTtUF1@WskJe$`#$E$6JTkBg-O}QBVH^CT|22H zBRMpAdBAzOEWIY+y)F_!ypE~Ph_$H=mL^$G&c?}F#JIg@*53G6yG~be;C4A(#ZIPV z)XR?ZVc+$$9vv80-jSk;FNkYY(8Vt(5Eb8{;szB36iE&-7qFnU8wN)Yc@Tvn$)|5Y 
zH)K-ULFf`^J;q@e`eKSE-=X3)8e!rn0VTwzqDZu9D*lu@v>-_GfUz(H@n=NHNxKw( zP9sQ%;xCA*<027%MNLvpQps>bll#8oOgOMBG=w~hm`6d&h)z(L%A#5|HGNvuv@#w| z*VOZW#&1kb`^wa{-V%p$rrUR^Op8SzNGy|E0=$*O)t@R z)J0jQ_4#ofeDoz%%FqP+ul?vErC)>qevFojYEdJk%hs6z+?>Oc=LXLKqZUv%d7c3q za~*Aqn~1$lR%TjjR$^<4x+{$$FscQ7Nb$DOC;=zVuvyf~K)IDp6}^?t6tL?&TL5($ z{GR(`*D$s~qrqoU)}~OqG^Q<~HiQ0IiL93;9d&kjJYoek6O`5Ly_UVNWbZPc1G0XF z*-vtfd5KpSfLE_4b8qcrp~{&X`W*YF?CnUfm{^2iTW;6$Pm==G;X9L3yXCa|hfdFp z+ewwEUH2o$2Yf`^M5lyfN!1Hv*w>UkhbIe!1X^$~?K1@6+?HplN%lGV(IrndNCzY8 zV7pR8JXAkZfr&N3#KRn6;$Z=$f#YsOx8qzV&(xvz_X^X721@;t9ECrWDjDjX!q6Dz z<;dF53{-?d2!`-yJ_Xe=gBx;w@#6V$F`dt>r`pHG*c$4r^jH)7L;cgzu=p9FbTi1K zRUWk@r=PMsIF(U1I~AuYX~)$1bc)dZgv6Ck^pULbwB)69^kJS=+0;|4&zWf>p!x^( zV+*rqht`QAenB7=tr?hGnaw`cI&*ApSyDRl$C~)3tS|pupC0|oj}%i=_y<4|$-8mdgfB1+8w_m&UJ?FK>^xx4S|+j2A3tp`3&6Fe@H`f7$3^uy+H7A{X<2&qnj4*!ZrrVljDRP%o@jKR%|l=7ko{5fn$ijqnq$OQ#u$rYBW z$9vO@aF1cJ$Y`d%?UXzaz~6ZJHzNDs;t<YdZS1;1YHwO0dY&;h`UZ|TPGHGY%S)%xP(sp_PB?u=^ z83~zSupv>pjdBMGcm4;SR8nGF&(f#JVG zH1PX8Fuw-OuL1Mtf%!FHe&FLTHLLnXwPBfy26|p9W|;l8ASr-#3y#CWCPFe3Iii3J zIMLn1rl2z*gRrD1{||*YDw2LI$>Z+8xKu3!2~vF#Hcb^t3Q9~FrqzQ$P(~wJNt+D+ ziV)smKjy4XdA~Y&Rp}X%@`N{xJ}-`7r*o;^f&Z*U;7gOgeq;_b7o0`GSy~@IeX^JIWu!^-!g4B z>jpfZ{`K#p-!={7A2?ZjYB2d2hWZ;2Zg>VaIf=}ecqFzwE4Dq`M41&io&#??a$?tW zwakesan-A8-i>N;-K%T960O7yuc7&B)QnqRD_-?h<27$BUia2PUK3Sb=PNHP%{As+ zbI!Hq-0GZL)7<(C*K7a6;BCI~+~6Cc@yzlzxb?{Bb@E@;qlm?9cYAzNuiuN9Qe7^B zFbT6TO}arUyDW;jgTX$^Wq2%jH;`%E9iY{NRNi9~#iGIfcKsEE!{}K>^|LYBvZ$y& z5?@V3;)}xm{A4T+dS*eA1nAnE)-TOsUCZtZKZSD!>_kXV-1y~#=>dcNyUhP`m?olU z7ncVEw$FC&jac>&E;G*VPQnPxist$0`ooD7=S9~ZhyEmCk9Te@Tf1c_@++~(*p{C3 z^{uvcw>}(hjRNov!WGki#bI<(RFL;0RjiBxI+_gWm}R5<>IK$2AE0^u!wZ&RV;}s$ zgi=k_zxTbW85M@=nX zZ7o~ZvbLD%Zt#wlZ+vaohS!10zXS!RiwH`4R#~r%s$m>F_`HBf#dJ8WHyZK+A{En^ z{hCH1Im|{y6%wAMUy7t?g-IYJ7CDWfa*NH05;TB58c~*IBEeRmyM-%7Hj#<%PD6w* zK;{S`L1R))OrTM!=2v|fDQxPbq{*B+=MgaB_x-TTme$l#*CM)@#EG|o-z^>G!A|49P!{ktbmPmSTVJi^@vV#}Dzz+bB z8iig2{G#-warS9{l*XbjC*cS(_LL=u{o{!B9}o7n_Bb17LiWcpR58m!zmJKFRDYIm 
z)rWh*n8KpG4rl%phPnpCtrC;GCIl$_?C)IHJzJ}{%xh+TZKl~)Anl%m$P)h%=K=hoJG!&>=sr0!pY$q4J>*!|K?q5% ztBlD^Jq@ubM8Y*E8{nD5uc$;wDC&<{G!dUjnaY=@yn*s}=j1)qSRZQ`r^6D*X^^WV;k?jJS?AhCRc6Fb^cQp4>t5#$`4$$r1(L$3oHwzeEV~ zUx20}Kx~rVSiZ!CHvjt~*!vkYJ6nOQjDv8U|6yvYp!(dfpka zD4=5rJ*yD6cm|(sI&<8$@*gidev!$N-w1I!0oY4eqDSTtnZ7L!@TPe?!2cxV;MHq* zdX9VtlnY09XVK7zFvAg7r$xaRIHn`P_=Vp50MHoJ2J`=r7#KDBc0&uWS5&vOp;2V^FHDS?o#Zs9ptT4$WR!?CM(=^hUQdFUG z0Zp?LiFGMgknAFHP|@Aj`^gVb@)MLeIv?^j3NWJb4oaI--19{gQMypfH&F%KOSXZ0 z2t(mERfE`5^g06^`wy|mzg?UBi#X&v;*v8`Y5kL2>X@W)$s|_WB=&Vv@6hWM6^dmA zF1k<|P>9;4@-Es1+`qhiW3enHmb-UzQM9eosZyd!u;Ja91N2OH4$xo~ zZx0!KX@QaWhrGD?J=!Hm;FUEl=~!!+LWGiUBf%&wgioV_MyKkXGPNHej;Qy6*hU9F Y1PxRF$Q9FV*som&hWp0*+s@nn1?QA&*#H0l diff --git a/prismatic/models/backbones/llm/__pycache__/mistral.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/mistral.cpython-310.pyc deleted file mode 100644 index 87082ed25d728505464669cafaf6044c0333958b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2491 zcmai0OOG2x5bo}I`0e#R64($V69I_@YbOyR9HJ<@c|gG4tk@`rxpcT9|z-5U#?Cp8#RVK@2k@a6lXq7> zW^AKYW6~`Lew;{Gv`2g8@-^Ycag%Y+5BbZe0_EbUh0}-pr(4)g~pN`yQI5{)deHvBv9x_?Utx|dI zQh#j(0Zrb0Q|sr48R!hoKDAtke*}gj2SW#u5-yRFQ8L*wv$(-*Zn6RgZWXa2x1sVj z^Z@aykCBI2g;gKboB}VhGB5G+D)I=cvHBzP06UdjTivat`UTcV_4QQ0u-gFJMYfdc z79S%MIZMzrE5~(&wwPAtIVvKwdGn51(Anze^`bsgb*%{8@2TM7PBK(Q$YK)R=b@_m zp~oc;2b=~_d39nObLv444yl_YJk(7@FDhH|WGushJ(C#C7hoB%Vh2cq4$v;%#~$7% z`^LVxZ!!GTOvpZ7MZe)E7^8i(YdkX z*WPTIavAoJE1I0pE7)T4iO$;}NuI zkx~sWN>#mgYx~Z(>s{Kref>svTb0tZ_!BT%av7uXh!1>MWLw0ldc2)Zs{v_M_8?bPqn9+8es69KAJaSU4s>01Og>P&oG0c%3?yc zr!oP#k-(TsOlD}CpAj>Ef#UvvTh`{p%AixwbIA!Q^FvHVEu&mEJNwpbjc$OBCAI{0mTaIrfNhc~zlFn8FoqUKlr3Q{{Yw|`UC*T$cV>oHR zB^7!G>zTI!mN5ZhlF7L_QOtGv?@09S3E*@a$@9R%U124W9KcZMdm?g^3!l%8ZDr^R zkB%h3A=Wd2C2+0WoQ7tT$=Nxf&GP{uFM=D)sn)UvV(11OfLNw_0F42tR8=YcZtRNu z7XvOCW%ywtbrO`>k0Oz_v?@XY0M5yt)H6uxiph>9I@`+brSFp$v_?0nveKaCXPU3& zXd7@LD4nUq$J$&shHL=&09LH?R7AwgA#()a`G*+fZ`&k)5sSPeHaQ}N`U`S$1(VWA 
zOpH^|_u$l8QUyv`G=M|dkT94B=^?Usbxc{P)tMT`QvY3XKAlyyGlaK;{EtE^!pmr` zka`ZQv&R!*YJWTt+AtB?eHUgsFcPi_q|W`1b=KTEVvVCLxhbsP*P=e%Ow!UuPSI4n goR#s4Zv9wK9uqZS=2UU|AuHG}n6GRb){B+;zbBorw*UYD diff --git a/prismatic/models/backbones/llm/__pycache__/phi.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/phi.cpython-310.pyc deleted file mode 100644 index f98cb2595e9ea9dc68bbf1dd52bb116df5aa9ce4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2265 zcmah~OOG2x5bo|7k7s86+D8&4fMh{XVxhH3B5^=bly?H!-K?}x4s+>@-978s^gQTk zC#>ZIVk2^bKR}Rz#APpB_#>SA$|*mA4Fsyk-j$OqVyrG#)l_%a>#ND?bsxd=<*&aF zf2$(&hY{wV3J6zW72g11C_oG|B5*@lBL*#fIQw_+SL?;uuZji-n;_`;qQw3xMpI?eG- zKOe^3eAoAHh_EQyj7M=At2j&BQ6}4=5bd?Kb)%Q@W6s)<%#!xzFutA18{w!3#oBt; ze+jXm4$!VnJy{^9@j@^71uU=fe)Ylep<7N^MWL9L6>Xk1#miC26XcQ)~ z*wt0T4@99Ch7lc(`ZN#Ka9{$Mf9f-#_zoD30t}@h9`=Z5c{W>S6>c$y+pNlgJB^sj z9p=IE8HsAoP=rn5pVosaccBa(_fMmUlm)Q&aIi4b)^_TpzR6lojlNOpn>#JAU1Uq8 zZt)qiQLqHXS}D^A)i_MfOXMQ7ar?e*nwa{tl+l>!hEXKp*HrLyTMe}fv8wD5Pjw?s zBQAM5;532c>l34b(+EmGq@hwgHT9zxwIjJ2$#me%CB^X2(#V2U1V9wpLpykaBRnA! 
zYhq6-3_rIOnc&mt7yJXpXo7aE7d9gdTJYzYyi)?WgDj2WZGB4e z0`^TUDNp*Gu{hl>z>23GN81G#kz6!>9}@-uz@yNGfH+F`(ZQYGFiUt(j^ZH{>`9nz z_a2L|_h9YX>NOVTipyRu<01)FJm{IaiK2H{@1h5OnN|lB%lF{I4`3B%fH*F}U6f483Ka^E^i<9ar@x`e%P_KA2ZW{Q4Hnq3l&OL zJc-p%dT_P&AB1AWZ%LWSeJsy{N}f9+=Z(c;UQP+xpN&G zzq8?;@J$NJ@J${Rbml(Zkmf|WYRFq2oKP}M3kWaak_t1{%%u4oEDHj}CgZbjd2sB8 z|MtS%tN?RcNcMn*=~z*j900)kk;p=I;j?4?S{a5SqPc|TE)2ezg6Vwa&@G3IKYGip z*H7ZMMk zRJ+p55^27evTKORvUb+X^W>*SbHQjTrNh#cQ`cv>8E_#eovXmd#@sZ5Yyx>7R$)$X z5wT|gH~=WTCKmbIvB_VgLS7Mv9FS_`PjYev6Xzr*))`#x?kwsmr7RmjP!1#ukX;5b z_1E|`b+tBEze1XCdvJDM)tzFs3y7Gqa1aGSnmMY^9bx*=?g-Oez;w4C!fXac!k9<; z{Qp=_o7s43@G?!R$wC>>T%O9FEdk?Ro>uOP>GE=7*r)|VqlU{LX+csPZ2x1wUh)0~ DtPf&q diff --git a/prismatic/models/backbones/llm/__pycache__/qwen25.cpython-310.pyc b/prismatic/models/backbones/llm/__pycache__/qwen25.cpython-310.pyc deleted file mode 100644 index 56835870c0f793b0950731dced1f172392a7cacb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3044 zcma)8&2JmW72lct;PO+VEIIx_4F(13u!TrVu41%sh4>>*<4ST31ud{ZF<5eDMaug@ z%`TH@!CWc>IkiBqtqrt55A~&gOV4x7$=BwRI3MnNv!p^xa+4+I?aZ6^we#Nl4U>Ak zV!(6%-~SnWTQ!W&a54Y5uy_YX`Y#~d@C;5k4M{{j8kwFMS)N6ZXNIDc;YjSX ziCVrj&XVnfi*Qc&8Bt9%w|-ajVM}iLheCqzxnJL#n%~;K?e`z`l2`z{{#)=r%5IN` zAqdZuUt#7Y{Z*ZfTpR9c>z$3&Yn_eT5Mm_^BNq74Fg#QxocF_2RR;kZjC*Y4XM+=J zYoL}dppj!s*H1ET+-}=>Q@aH}Y@@d-KF*|{FV3?@V=0a~Ybc~Sjj7D_6YCqrx{oJ2 z|13-wy4Zx_5hyrXCS_VS%U0RuZ*WJLyd*5{3R{>Lv1lA#fl&osjnkm1Zf(E5-_ScvzNB{+^iFMm5v0%YWzB19-jeuEdWE0Y zJLf(%EW=xdV!NPbYfX3Om_kkqvNooOe(VJ$P4Y_I{Bl16Hp@{dgfpQ^7 zS@KB4sxgcMA+fkv1T~{p#;ITdl<$D~StepELAI_I<8j1vA_~1!IZ|X}8TU(b2a3&$ zYZFHLS0I^jWbBg(3CM&_%!xIDmVIJnbV4o~|0I7V#F#joKC^Z^Ex80S%5zATk*pxW6YL|IJrpK3j9nP%1t5Fi^MJOANA92ur87O((BLtm5RdZ!f^-2=N6C*it8EKk$5*gBl2hP`mSOmMxvS{jn=GJ zz%>iv_kjI5X56Crk579(CI8wGWEcHiL*gJ=LvmcOh5Hr2;5dbAEE1B1c(EYZ{{|(m z1Ie$R4&Y3U*9MWI>kQChw+IDV$IcSj2e9e|Vd7_NZ=V_1^1u%RHj<)0OcASi23O<* zu%(;9q4`fvM|PIVslX7#FhS`RAOIi`+CH7WO%4F2=T?CCKizr$5}vj#uY$0$b=#_% 
zuE<$cJNnd8xF{nw5I#TiYc4|tX4CV9KMw%0u?FUGC{m3-nC~lQzIytj{*NZawFr8% z`=PypEgAcfU`$mQ0zcS?d5y7yu^&!%%rukAgcg+OR9U?w33Wp#7itR{J3EwEyt0gB z9mxg?(l}PmwzesMjtpDpMBYFNJXQHCp5L?t0J*UNIY5Xl7GR3$rNqlmT|I{Hnd~uqcL2RQJs+-E zm`J=X)!W}AU9>T{DD;Y@kowTQAFhn?L6_hkK2U~#-iCY=2_DT{uj>S8ym4<9WwD7s bmA(bUGL{{e)(HGGbxPJt$2u|oyL|qCVXHa} diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/__init__.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 1290d8ce8cf7a06076947df35acd953194be844e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 582 zcmZvZ!A^rf6h#M0TUsG$qlwY@11yxZNq5FrU1?gyn6Q`)nZl$@oBCjxloqJFs6loj@4QoQjx~6)a zw}gR>TDN#xba1EE2JZ?Jo3(E9p6KI#tvh^ASl9yHeMFA=3ExDHUfG^ZMOIvuj7L;f zL$8!{%W|~pa5f`zGQN&Uv5QXTEH5PCHwkQI_=gX?nC;FS`z$OI;*Tc(y>a-krpeaG zonvErL2?>obyg~+`Z=-%1s`xu1Q`b*glo8=(2>|=3)oM%hPM^BvY0hiM_K!;I$E$A zO&Zcw#D0cYjc0=QiewMAx_tR=oC@m7lEpMxu1T_VSDd*1Y>Ygv-*KnG@+H~VJZd%yR!S)<_*7*GHDd;8rS zA^#vTJuDy|!qh*&B1ne_Dp*Vt)?o?naEdgKt;FuwlsqBA680exwqysq;{ff58qhWA z09_OA3(}e!9(vtS%OF=-k{42~=6jyE(ba_tqr#8kP-|bvUO(*@{Vesfo*(+jATIj% zih)Y~>YCpj^kX5F-^-LQQ$0}qbjy#jRQGiu(*n3e=X1h4v<_$oP!aQZ3fz{ci-tLM;S~9pXo>}MGAAhDwp2Er z8^j(}mT#8zBoi_YdSTL!_sWLe1Iv?BSW8Dqge&5p<#^zp!F&u;FTygSBNC7igB8qp zK~Csqizx@bC^YkgmzI`sZ|a1oI(+!|*YSt@lqjuZHd-Us)<`D`$hZV;gJg8)Toj6jEae7G`W{_2>i zv{V+$vX)4#!!3EiU<&uq;1Bo^8mS=8D>GVL&EtLn7sSIDS1k=7OE(lEKp3w8^#P%? z=y^THETucpQU!_LijZ+Ua6eW-Edenlo>k(P@f2;HlDxbniy+T1l>1?P>eNq==PE42 zTeDbP#vGy4@xc8@v^b%a7UvZ3%$vqZ*Whv`_Sz+44T~|V<)Q1+Fp)t}dO-kwAH=w? z2f^=yFh2dlA=4E)Lp5>1pj3#jGUk1XyNNDlhrUoYEa)v0xGuR#nRAUYwrr;P8o}3@ zfia9j_h30o)rE_x`iAtVN!LclzARZgPL;kUQRhYd(1K!CC22!6fpXsvi*)87O&80h z=L02YfZCiSka9tl3o(jlAP)KwECo3rM-)<%9W$)2zp+0_fRhU*_#uIvWfZo6a)Gji zZFn5m*D%6c4y5sruj%!nwbs__7eWDLy+C;guYe-(mI(Ha!%e<9w0=w<_(RA0c~Cs? 
zjVn%Q$sw+%+)`!7S{x=_5w3rUPhp_f$?)z4>(4TEbuG@KFxKm^sV-hn3?IP&*Ee7p zfoi*4IU|xn(|HSq?|`E3m{GGNCH0|DbD3%NIdH2LTqg1I1(09CG>YATQm=euz$|FF zrL70KR3-vT-i0S_!FlPuQehr!50Wq~U9;cHi_*zWjF%o(f*{HUX`!%d0xWYuU>a5s zRN_sfVpyX_8kw53&OP3wi+p(F&`fM#CC1ozjIX_hWdslix&q$XWvVfP-n+~6 z)u-e&#EY=b_Uy=n^D#SO$7CPM`_!#FBXYzcw&*?|@qH_;0c{!D9@+cW2n6ofBkMkN zioiA$^lw9vkpcmNg~wIE*BqUZ{Sq z&@JjVE?>jaa?VxZvSD5gPAht60X>mRadt2d;>N_epfr`_4!o+ds@|rHw8=aQ{exMw z$(!7|)D7brK2y2JNol~p3$GOjKRuub=n)%H10fsn`v@@uP|1#UOpom2agzbdK7)J4 zN@H|cS2D?7%3Q(!%^wQ$1{$>irDatpU>oYpR}lxXmhY)=@d*>xliU-Qxl5V0LmhdMqP3B!; z!ukNzC?#vT>Srk&I(Vy$LuiykU~?ChTyUBy@>SR~lEA zSLPI+Z9)O41%b#SXihxrM#@Aq!caQJUJidHRfEEwV{S}|IJ2{K*QyS;j<~ZqQT3Li@~q)<_|y=8#2X}i1}O$1t;E#ow+-7!MpgY{Z$NW%F~{zYj5JJ zhH5@@hxOkx<*Th{Ol@iRsWWV-)*4EVx^6oq(+X2kCLf;@F4sk?C!~Xv#?n2%VoDakV#s|d||YVCbF1Cxy1kNG%KQc z7~j4F0mWvzm}!}X1@2A!0&?ET$`GR6j^?^-=8g}Q)o5`$GSI8`=dCW_AB$=nx#*=McS>b?Apohp8o=XvJNjB zA40n{31jf>D<&`h@hDx$5XOXZrsHg~h-Dlm(?vMN6!bfea-5W5o=15RCPhzf4vb=G zoXSN4=x9yBY)uCiVJydKQs{(`0D3eJyN!f?$C1+c9V@#Mdb#=}DrV5*I5d$iPGkyr z!~BRKBYz}eu`rN)p$6jxU8p{PLAWSZ09F=q33Ilm!8o$my9`Kv3`hm z@ZQ6l?|>-2VrT4}ea)2k#zEvb2g~~Y{^6s;fxq1H_xJn64tyT~*7CwcS%(`B2cI+3 z#OFVw{w$sAzF9;wolKtq!9H~9fAZk{?)xgF!0T^R?ax!CT zulU6Kl1;cmW<6^ozv@_R;KlmHQ%&U~$GmC=dtJ4qT9kEmU8uI&#;EZnbJ?&7^mfXd zHhcCccdo?SLD{m_2a|9f#U~a+Ln@BR9~TetR2Ar(e*sdkr|gWc_)46kCKO^N6k9O! 
zTH$=f9=o?#f|aNID;Crg|4iWP;jJs7oM#LiG=ps}Mh%Rbs%H22;I|ZmzTQDcES>#) zuyptP`QSTa<&bn~fl#bmiIY$NcyRdf=W^MUpY|)(#>8`8I`e#L`HD~(EF1eMNFx;v z?vOz8T#GIDz6Y*e5JX8-1VL{gO_1AoZ5D5H!FRd$zgEq6w)u{;tZMkHQMiG%bWIxT zvOd>&j*53B2q`Kl2=9^jzk#fPq2f1CB94SSI~de`m|SgoyUSgm7RL(ub6Z{?l3D=75XTeiM?(uEzgZjed+s?M*{(v(tgpA4bi>C%=?YJ3zbFm62r1(^gKP*nZfUkKbu?7 z0_~`3AGHe93ePG55c=7Pc?Y{phX^&6%Zw~{ryBhrrLKyf29z-mAKLZ`0L#=<2;qSI2y=H|^XI!sZPkR&naw_NJFqvr3XOXc?8!1Ik-l T>0H)owuGpOn)mPP+q>?!kA>j6 diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/mistral_instruct_prompter.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/mistral_instruct_prompter.cpython-310.pyc deleted file mode 100644 index c150796054096edbb6624e73544bce1f71ad0ab1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2353 zcma)8&2Jk;6rYdv+S%A{XhWc-NGnyexWeA{LZm9PXp4YW5D-NiQqXF%GqyL`FJ{&) zu{M`BsQeFArR3NPe+%cma_W^MC*GU2(}Y$nE6wwJZ)V@T_j@1ftkZD_Ja_;2bMjk@ zkiSqk`B)%)21{Q9BFGjIRIr#PY>QFkdCU`Q%cA5?_^k~)kHZMRO#K3n>MHk|UlTULl=n*iIY*9d(i8F#pF1WNLKac3rk^O{>r~qsa zI)HE!c3ZGue`Jd#;T%|7cCCT4mRJ^T&@|0H6CKe7sr7_dWNQg>vr=80N41FPl#iQL zJIREM{ZWua@m|%@d%BRxJbRU`WHD7~$WMfUJk|^T>RUhx@|Yabl9ueqRLf87cjDvX zf(d>=fU`_Q7Dz3S8lqv&*uXah0Wa;%9-HwEJ)Bt^o*sr2%q||}@veKYbQ_4EC3#8& zd&c2SzBy~&zq5Joho9Zql6&8)+2`art$3ox3aXtL^3T|tWQp|DG@8hC{4hwz-h(*s ze*F4o|E37ARJ~k98gd(jo&iOB+d;Uyok8V#ah$+5mNZn(qGVoM?Nw_d4w7vV4BtbK z8IU%at-mP?10O6!h3^kG;w%heJp`t<@d^}Z?;IMw0Hg#kcDb@kl0w5i1*mrbnrF-a zQ;~|gP?cO}S}nn8z{Jur9@U^MgGM2q@53@~ULZZDI;cj>d#$RWr@2&BYvEqSx53mI z*sPp~D#-oGGzrqGW%zMkRCWzfu67j+$!d94~s;Oii!_0!T6e`Nq zyZHVZ5-hZyUE}-#DlP)4I>xF$N2EG~S<0n2RxHDH3*lv-m}FF~KnNd`_#s%*t3WQ( z720JEwW-4_+T~qty)8Ej8zDxzcow6>B;ym%7MKqDm?FT3tfU4~R`PX(oB_yW$2y{i zjU$)<&*^5*t~kVRVm1&|Z6%ZJfy@ox`wFy<3@2N|0>c26s zjwO7yvO=Z>d^_eN@H!mSI7A%guohimv+HlH$Jg0-rH65xnfT8FgJlKdlk7k-9de9P zT>&-tj%igrBp8&sip2EG@ldOi<(Ny&aJm6!OfR2hvny|R)Vxz)=U|(sK_Y#>a(o{K z;50_Q?fbt>gSg&NAEOVZaIsz0Wh9tH)dx~_=6xN?IELPS>E?MnV!9mD--TtmyhU8P 
z);+trwxm#xxo!Hs$U?Z+F7|R5wdS(w)8olJs3-H_crp)QG7pX?Gw8KE{_mj>Xk+?> ZFUBkM1qO8;n5 z$EUU5>uvfGd~m}F5^7Yeb4(*j---VB-i~(Sr&SIFqlloWAJPgMzSB} zgLMAD00QjY{OD2Gg82dmut7JJXI#*mf`Di4R+qi%Y*dh5NTB?+tTt5( zOh**N{6ifd-G-stAOx+*b0XO998B=7S<{kd-JdOaHG$Us7vu#sJk?_bR3}!v8V)9T zDhFyBPh>WJ9A@LeFG)D~>H9mqJ0e8V4hj`(NNgMpEIN9y6Gpo`IdFK8Bq?koxq%wU zgp*V^t&Jp1cSN{(1)XFd+GKX)P1O5A5NB~21pSR9kHSQ6f>Pf*GvbSA>>7v)V&3J- ztw;t%39<8M#%08} z+cFJ3Eu=E7xsQhLfT{DaX}rfOEP}~24KvfS`nV{KTf<>Iq)HIw)2!@rg+8kW3Ks-; zu@nSR5^5a;>H;oXSKa|_6^2GvmucJaoMpPoXIK6gGUKQ`k*1kSt;4ZAMNKR!q~@1s z@edFcpd;uSaAcRMP6f?tm+6c5$ZtSb;Vg7^XiMyf9kL^`&+IL{(F2V?2P=mTkedR| zY@b(r-^tcaS>BkR*l$z~*lohRfj9Qu%H4M=aO$C3IkyNS0pt#5L^SP;R1R>rwe=FI zVl!YGHNG66oi#V&6oTBe33_Q0)Kr){(^RrZpfN$ZN);Ds4e!5;;xY=PU$lhA5*uX6g(7P-$U8hC?0Wjxkssc%iJ#`&c-VEu|Se8MNmoh8i|Khl`Z^1#0?Z{&u zYta^)eexDP&${!_XnxNhef=AOZsdv<8_^xFk)};CrMGr+Egy|d?ZzPih z3i!2aSMi7~Wz5_P3|q=A;?uR23zyfH6s~V|UDFH#kw@^ZTkaLmJMClUi$$yL*R8g{ tXtjN4wf#k_g>@|#y;k8Lfi-=?7wZ-J0&TdCauZ9&rp_Vlr8c|7{{x+(LU#ZF diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/qwen_prompter.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/qwen_prompter.cpython-310.pyc deleted file mode 100644 index 52951bce3189acc0bb2747a4f5fe6d766dc2f8be..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2567 zcmZ`*-ESL35Z{mU`Es${v`y2d1h}G7VNvV~p;kyypcDkEz%2wJ)gYZN-gR=#eRc0l zh;?}(MeY9pk&?$qh`)vBedVeD0uR8yZgtVTgUT+ z{Ds2F&jR6N82L>gf(ROsnD!|}o<%I?eGWW_KkGNvw>~GLChW&V*plz_eofTZN!xh_ zYe}1G=U!HXX%a-hH}9)7&WcY)VI-s)fBGODd4ZDN*E=$~?hTX-3MsseUGH`jZUh^F zcPC0mqT@~N0&h!3*>DtjL7sjHCX*kdV1TEe{OS`S9|9vu zp9<1v;*4Mt{E(L9%S2d$4(%soNR2;D^aN%d7%)Gy#hh>-TYbCIfUPCwMFTW-GtWd* zv_Nv65R3HZAO;KiEPD2W(>>jYQz0XN7{p<;tDE_5UdY(bOk}jJWHD07fS?*aBw!y5KqA8=g*MpF=QYOXhKtd?QyL->YiufZ_) z^~k9kU>=zptN>F20^3~KB}u4Y1_?xkA8>QJCYpNY>^4 z-)9D2M_mC^T~{)Lwks7@iix=Lkb51;8%W*+(zYvDuA<^?Ai8OG_a`ybXAsm(ikV^_ zj-3isj?WTZJAn{BVvaX3J>?o*po@HowwOz8>N1PAc#B&vg>kxgfUZuzX`eN?d7v#2 
zIl4~~i$mzJCZ??9w-6n+PL64>%{7Pot^7q-+g9bJad5s$PDsmaSG^0f|My~XQx<-f z7BVTqAeywF55X|U(r}r}9BQ%gwbP0CvW2Ir9X&GMd;!c0S#kwE6XxNA{JVG;o^jMb z_1g!0pF(EhJoJKJ*fOqU7oq>74l<>2Bo&)Y0kE{P7z{Uiq#XPnM z6+2#eMSzUC>VkdS#U_*lvGjfI`aZk{qX_wi?>`y^Q8lA(;PEE^u{o#?lJ}8(07N$@ zU1T7m2s(+moFQ+aiRmd={Memy?AZ?S=+cFAmzL%f>U(Wl+r@4M;V@_Do3q!cQ@u`| y+3VDS*Qqmmoj{+<+3TdRi>qZMxb2C1JP((%T6wsJg_>iRYmf$GoKgFqMf*Q8n`u=5 diff --git a/prismatic/models/backbones/llm/prompting/__pycache__/vicuna_v15_prompter.cpython-310.pyc b/prismatic/models/backbones/llm/prompting/__pycache__/vicuna_v15_prompter.cpython-310.pyc deleted file mode 100644 index 66edffe7a54de8075705125c40f9f3b031c4a75b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2622 zcma)8&2Jk;6rY`4?>ZYh32jL~0I^b`VA|M3C{&7~h)@us0ynBdB`l=XdS`5JvR|Fq zO^CI*&`9l-E2UDxtrz|T{szu_<kQVn%I)FY#7fNsEF?76$kD{d;1D9yNNru(-21@8wjmyYLbs^QP`AY@!P{X{rD)Ps27 z+*6yEc9)wg&aI)BP1ee$ZQtd6E_m$oRcDxGsa$QfhWTIsyZWBbn|{&?Be^eI}TK-X=r8jPMQTt?1iA$&zLa!WyA6DvL4U&IFG?ZKNmrgODC6Lp%*h~jwiC9 zANYY6IzgQAFboE0j^jxg$jpngrn5CX)pZ8JF8Dj-VcO5b8xG@{7lfSMPzR<-7-X4*NOU-G}k-jF^N5)!9uKOC?eDa}==5(OR8VIHpq2Lf`9DDm8D4U>qsL>>%zJlON% zK?`_n-TP)^X@hx~B&{jUErpzH^*n#4mq5<6!Z3nqOeP5TNrb9|O+Y0Ll}p zASsE!S_4U@^F#^+4bU(o5Bp;{=3qq3gLn^`GV=;)P_cl+hIjT=Sdxh#6;;&r9$fSWYWAM+Qg@g~UR+j{K1j3D979*y7;>(kh4f^{xf{G3wK-x zO)toi0qFuMaiACafe|mBId%19{-9D+VAg_u1=k;#g?V5UV9>T%7&i%pL{dGd@2W=% z1L&=`pJ5~@p32G?XB{9OS!+QA4z4R2pD7wG@ir_iEx}VLSX^AhjmVd{i~<8mAhHc> zOvSr6;D9J+p;ovP$jW)7G39JJP*>MZyR&fpN#{krDe}sy$C!(xH{e(my4SP?&DIw* zQ=8LlkTx~6IektyUeD*rIX-%S@>M{Y61WMS3@OweXo?gHIYPz7DJj~76k6yvfwp)? 
zQ30a>J4MK07X>nafMe~*I>zcm+vAL=mO6yHLQ8@A0>-i?c$Dn&RKV-{qqqcCKy6jI z(Ws1Zb#QEUoB|O1>AtuIBjQ68)A}^V>%xF%Zkl8~&ft}s)angbC@~joipbf3UsYS6 zqt&O_8|u6sZC!tso_juxmSz|tT-LeWC8 z45F-0YKRX%51@{`F!CK-qAH3B8V0|_GFc^#wm5h0(&D_p@pi*1E!Sm<5BHkcehTGY zT~>ZNeI1(P*P%Im9h&euG^eivj7vWK!1GI9PQ~&?~ diff --git a/prismatic/models/vlms/__pycache__/__init__.cpython-310.pyc b/prismatic/models/vlms/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index bf8ca2fcd2b2729e3cfe8c39a72a7af2ac5340b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 206 zcmd1j<>g`k0S(2Zz;Z3|{Iuw=7YaibAT9PflB-*J}d8@c_VL{B0goFn( z14t%IH@j+oK>on)(U<)lIqrF?a>^;^%}J+p&kzzV%UcN~G<#-xW_r4NzHTJds-?m6 z_{X0H|D4mbe_><#DM90|rs@1gV2IYyh)#@9kBp9?BVPy$k=Ze$Vy7tEW>|{KowDSM zVI`XB%tThlie@{rvab}{9UJDA!fI6O)MQ^doQvw6dNkjem+eZp5S{Csll)A$7%g>{ zByWYw(fQ7K$@aR%=1&n4?WHuLVH0Rq(KrpNzd_|=K%+PWy_2Enb)Vzc0y?A+TskYeh62B!oAeEDgdS9f4@td0{Y?6P+;W2Y$=h z83f#c|0v;UIC9birllF9D?^sR(&>OYJ}7~Qxa}Maf{;2MXpm&gp)ZG&1rd!?SRv-A z7pH-jQme<3$a%&B9>GpN=-$L-5=Nl~b4S>O9dXHt>U@x7A#u9Y>C>1pFLY>}Wc>lE z<0t!+!Jep~+&V&=aOWm%^;=En(Y^bdk2bz)TF=(D9yOiKY?L)X8>zTDNz1anrcMT% z9UK2-Ys#HleCV+t?uVmW&dN$|2#9yJ!46l_(?bwV9H0-BQ5LS zc*7dHn4OBFh9L@T>l>n$@2J8Fq|9etI%pI`Wdoe&bweu5t$?SZ@^pyN<%MmSJNH03 z4m-Akmq9G*CuiG8;$F~)m6kWWjC+mye>j}2HFqERdP+rTXOUv`GEXdj?O~+5xGG= zl%qB+&zwK&k31$%$S34eIR`+3Y?05%Z8>KS^w}m~kUO$fCts4gueHwncbchn763rn zq5!vNN7ypt-JTZ(;Ybt!K}1D$1w_p4OU!V)6Z5eem_XQJ7@c(4*maG@=sh;*rLcY z02xtnU4$>!Wp&t>Eg(6EWD&^{5HSbQ>4H_@Zn=AO#Kn?4m^$)QcL6LT<`vFhxVpX! 
zPcaw4MTA{&@7RlIqa;jD!`^#$e4~pQ*Ol1ox^J|jjnzRC(N&fO0~+@a0K!-ILvQu@ z*4oM%@rEg7tHT@=SEc(n2t<5!DnYQza1;k#j$X7XHNB>nbh}^|EPZT4Ftl<*9NB5Y z{6UM*{lKTb4&wQPJo;$^^Cqs7YKPKKuk>95?5vN2^@OqH002v3^|xRt1tEYZEvXvB zgNu2DL)Q8hVbLE@e~&u=m0%k<3m^+_eA|IEmPG)gavDP1t*7n#wn`3eVxdxCw!pk9fYPG`H!egr$VhO@lx)mZ=YBWrOIAwVKMQQKA zV}1Ts$d+LM&G7_&el~%@KzpON<(`eQDEJ{~zlHuYz1+lsx3v>g%^TJ0z%`D-yuJmN~HoCFMn!$h?U zsM=W~4AJhLYxN#g*ezV})Xo>s+&xZ)?g8_LL%dPLBnp!mE0?5iLr|ycXa94 zIO@J{C2@a;#ynv!AXld;R9^^Uk;nIrmQTs%a`q4)KXmuKFr)nF@+qlJIDVRomLS7J zbtS422)Rq6E+t&d4Ixh>tmcbcKVAfDfp3&<(<{9YGf;B5z6VyH@-U{D7h*n<1%P|3 zKNpLWu0cScSaKmOQ_97xN=6VSekvBCEKCEy87~B!hAzt4EP4skTmB&b9xpX)6)B}EfiVRDwehSBbk*-e|65P46*r-q z=m5kGUs0gBu}f=;^zFTV$*q8a7-k+Ep&hG{@WW?1kV&@%F0qhRRo{?-0t z)Iq9X>vdDE*tjCfu1gYMonL*Qf(8a`A_`vDXLyn1`9ygL$uJDMlj0^FVFr;|iD6a4 zDmbSofmc)%{>HH|;I*gAIW~jBXnpak>Hd>4PXVujQgQXTL{$R3m48?878K45FO9hQ z%R((x>7cq8;Nto(h^vnm_03Y+mSwuM)Ra>sE}~OJ<#PaksPFF}l=4eJOwF#=!4r#m P1!!HjbsK(d{gU}#znHwD diff --git a/prismatic/models/vlms/__pycache__/prismatic.cpython-310.pyc b/prismatic/models/vlms/__pycache__/prismatic.cpython-310.pyc deleted file mode 100644 index dbb0f5e5d513f7891f2830fdd0cd6d864248ca5c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15520 zcmbVTYmgk*RqogH^vur8&OWsd$!^Jtt%;>oNWg%jh}e?l$XRa`d+o#%i`miK-rZUC zJbb%HkD;fcL<*7tCE-nh1FJxwOaa9+JgO*O#jD^4&!Xrmen5aCRa8M$s1$5$i|?H7 zp2un(Y(}+Lx9{`xx#ynq-E(fY^7*WS-&4Qv7n}2*qI`)ShJOYg-mfUC_ixAurK|{5 zXs+7O%9=`f-PIdL+2Fk4rW$71|SN$dogUY&qK)D~~mD_rAvc<@-55?%Ium<%5kw|mRD3%|5QV|nk@`fTN zodehP@hJ{Te7xkKiv`?*Rh4OUT@Z$8+O$` zm$Z9+`RSu}t0V0Vr|HP5YuB5;BiE}n$6jyAK^K41u`g9+z18t-QD0woq|@|m^!Ct$ z2e{ml7nicxm6q>#k7jNBj@f50M6-4IsC^l;x0(<6w%55(bE}?bulcgt^e{)mk>1id z>EOBEbOdO1+%@|MdY-CwysEqWbZN;x+w^>=DtI9D?lc^{#3e^wwcUE7?mL18_8gz* zwe8lrO$2`j57q+}$x!xV=gy+4>f27Udck$*WxRIP2b263_vti+Bi*aSrsd_Q9|EBf zXMAVP?)Y`L?$^=l$hzYQYEy5vJH8iB>u<_dXJeDuQR{eqt6{fgYs;zm^;XmNHtUVj zQufOvtR*!Zt2Uc0A8XZWdU#B?tJ1GmU1W`ORe!T&hSsU7>k^frak^gfLu0w_`Jwfp 
zHub5xp|N_k?S!f4ns{76ztq`Ab;BuX;Z&u;D_*%QtL;j=<<@Ig!)&D`<2>5VCLH-t z$HyCdr0#E?t9m|a#_-yYS8Eq9w3-ec^XFu%(e@wj)Lk$J1%>D99wr=WqC*T&F%99O zH>h`=N(*aqnb=y=!-+E;59_*o4kHmSS~478#!^@1pddF~U4P~7-Mckn)9}xX9#roy zka3izA> z&WZivzzw6EM(KxzEe@j863>VS#9@?XGBbPq6VHlO@rd|lw9O9Ji|54i;=SSo%Ez4C zcK(i%nZdXpVVX{f(>L;%zYx`kk81UHzmgER2`iP(t%ehIWKB2s{CKS@WD`BRucuT50K-@Q77V&OjouvO*vYK)ZmEFU`Yi(b~=CjvE zm%)CY-JM#q>mKwcY0Y-KC86dHo+;^a5ktuRGzq4Sw^Yf1I*=WxgELjvb3*+E#|yQU zRt?M4|7pmj@b@e&APqolI-l*SzT#s$UB^Q1L;AhiOB!BF4b{h>O!V5O4E2^2$ZJdQ z3f20nid;hHozurRTMg&9?9?}%=Emh}bL03Ww|e~f<;RXaCaP`Nk>h=~9A_c+(4gTB zN@1nK>h{*j)w=~S6rAmdAOQo`O!fn;?f+vwldVP5TJwWYbRftHA zt8T?_U38k>m+9@h*0a7OU45wg;348D*Fj4OyXx7EmUPG>z=$=Gp>78ybt}z;DexUT z!b)ZjQ&3*D&2X$yZFVq;2O|(>oW=$0FHvvE`_WC#P;wYaXgN*c(c;Q!Dx0OU?DJK( z<2)fHwBUhVJGmmAmpXOnc$E!V73Fbg|3r~rbgp{g4DVbr2qVQ=Xa&69yX)dgn1WH= zaAM&|-V=M;Y1W`bLfv(mrA%mej=L`BiKxCTvDES#gyzM|Rk`7n3mA6;qgA2_@J!r+ zBDzu`=D(_jnM#GNd8NVk#dZr=MH1W#3+vKxV0JD$^^Hy6t63;ae$ZAd z^dBUl|4}>yN~EpUb!aD3sIO>&^;7Dn)n3MjHVc3jg??KT#=O$Yid2xjq{#cAZScQ$ z2>KvR`Xn!1(YRk7EI7_uqximchr6<8488=@)FYb3cm z`&wUDt=W&QoVM47E!NsWmc%oqjC=%36dG=;DxzI9PFaXx)pxMTYJMc#293*o!otY5 zsf31e2{395ip%xn8I1O(k}jX5+EX;44#gA}BTG9p7wNfyXouswUi&#zdq+=933A-iAmz%I26e}#1r(E+t{nThwz?5c^!G3%7w8(XaKkbY05XZ(%8?$ z9SauS{G2Xe^Kau+$Tb<_)}A$4)MQ3`d{%9~AC!z?e}C5=l<@2*AEHF$Hi(M1Moa$f z(9--GC@FxFzwxtMV?<3Q7t@r`LEK65eVTxa#u+UcLhtjZNIm!D!rP+_069;TaVwT5Z^toqUudvbL4|3UA*J?ORkf|v_b$l-j1MSXL7Tr7~Bb8fMpP?rJOWT4a5Z>j4shzM`|j1Fa3ZvUgvd70DVVl6`~>h1@b4o| zf^Kd9a?AD{oP(k2qjsKs?AS^B5YChv%|mvzDeOaVVc>kybPg@`hoEuZH)uv(b|I^G z9<_(9%{%QXKpzxu*J-{NM7dY_=s#?s2qs1jiF@bilfcyNo{A7=GW3NOA^Rc2N3Sqy z_?VRbRQ-2)l-SUSxbP_4HIbnu9x?;T>2jjam|%!4Ha&D)_t=0+lmM3><4F!tJw0)o zxaeem+AzhuKC{ZUxAP#h@5jzCpUxAP|MMrl@I|otr2RY}CGErsV6W|{Yw|s4O`x#m z10Xt=aDIK@&HBT~Fx0tYQ@hn?9y`18*yGDjoX0#r@CBadTynm#*G`hr3$5g}x|#kv zu{n$8+G~u_xpg;9CPseYi+yH}GNME9{4GvA zybOG#nxt3pAkCNSwarMfq8AzB(eO)9p6PhA7hz)FTIV5Ry6I<0o!D#LNA5i^xH0;aP_MbP>LJ-!!7nCZLBLIXg(6=v%K3YC2_(ZP~|ae5ta*niKv!<6vJ 
z`?pA|UE$TPB(D{p4n6rO4V)6rh0aD;Y=SDdf%?>i`4OdFN=0bz1?po)YCJTkQ<&mT zyWw6jK6530DDM;K>0Lo$Wpe<$In~lleL-Kqxz5s}5}ft8WC0~vZj)07W!bn@PLKN< z3+e*ula@KX5Vhv=qLIZISpaJbsd3GCqkCWkVD6?bOv_FaM+mu7=tFAqw4SdKz)XJR z;G$MoipM zUo>7;URHalm$jS9P4%X>u8T=Ab;Cl7?DiOeg6nF}^mBm;H&+^Nu1k2gX+GAPVrE_u zvx|z@w+J@$baH3PA7`58-abu$Ec4uJA>vtSv3qhE$GPNyN`|sAd}KUQU$O_1q+V?0o@rPmTN zfW_j$2Z!&3o83diTN;Pi3O#uVIz-r~`rZ-%xsMUoo}z@T+m8D1Jsv{Z0aGOXWSQ7Q zIM;()gpOOc6iB6V3%lTRkVT38fKpAUw=|Rh)Kf|9J=A0i-MjgRJ<`z85uj))3j)HLWA@D&iw!r~hz6-4odQ&_)2AhheO{?1LMO}Mc zQ_Z{GZyMSE>^C4z47rQ;@*j53KHYKsdK)?%L8q0L4HsQq^f4D0cp6SQid#_R=ZPzD zeF6B{r&}FT`^;SMwbS(A!~khZI5zOhc-?6iXckytP`1?eyN5?s>U7Pcok9}FuEU_ zQ17rbVH$BX3H2x+LkqYJ=|v+OQc4eOgKSWLcB?!??bJ#+%MwYN)r*!&QWN$GOz*V_ zUqLlH2j{e$Zoo5O-ZgX!${l(9KZ_E0OSEZ}Sep69bZSnevT3!bbr1fZODxG{Fe22X z&Z+QJBGhyu+g9%=5NkfyA*cDeZ}{*L_~}iZJSIIIB~}#K*w7K#ARmg}U4Mq7BRGrL zXP!NM&K?BSY+6A+M{vN6WCf@lkrb>dG#MY4u!wc+N%)TK#BAC}=uAKnSVYn6C{|Zm zfQIQ#N(P3PjQqCY9QjbzJ<;YsUmH6=4)8??gH>e?4I`g4`Gz7-bRM~q z!E0JxjL$E74O5<@Q`4R`_qCG<@}Bj(z=HfZW<;1<@yJ(Cc3aam*lqI-&4S|$ZZiCP z-F&0_j*;bkv)2nV9Cn2#+S^&^+j92JFD?{rx8!+*wtSy3RI6EORK1Hkv%A_R z?Q(5|$lMCHSI6;o7I(Ee0~z*{rX~Wij_ZZ_DDYch=f4+@w+V*9Cz%X#us;X|Z9f*# zHayb#FoVEQrG`_qGXzK|BzHcXXmM1w(x=}Gr{ki{x&SVW{Gnbr7Ks=;M#ISl=`n~6 zLK7ruHOG?$@ViDbko+h{LC=hDUPMYdT&S@6U9-46Mf);4c5nAvim-w^V=Zx+X8e#gQX~vyL#Jux?nl zIIwp|?d6*Y>uu+5feOM$z5GW&jR8D!U00Q+^$m&_Tvk;$-BslUa=k0t`5=#W1)=rE zi3)JwCqQo@DBvC8(&zudBg{l$O0?nkFwh!&+AanvdKb}SA}G?E3-wbNMQEP__M+Zg z3JwjILa6{5LeWfM*2#F*f8a5+WQ=tUy)he}xi>xc`%R+X6a`&7DjYlcz3KHQ;rJQH zAMxGjP4oE1$oSI%J>NV26xvR4xSB$z2#ZGR7==$!XWmkx)$Cy{gPCY8LCL#t*y}1{^-X zOwF2OAE~vP2To2{%kYG7BQViN^3R%n0pY&OrS+*3g;3O3Im)G<4Rr`GOmOeze3ubg6`)t zgcxt(Z(gB2`28GF9vFpJ6TEj3xzC`Jyg|tzmXJL`p&kwj)0AJKSr(M;krDF{(>jFd z`khwchtd&}y@yo70t{{`4$t7$Vp5e$L*R8lXaY=i1U(TxVN&~4G8+9^vT8)$-2YA9 zAY@G)&Xo26=5AIIV&-^E4w3@(9R-J@FqnzcI>mDtks&75+>qako?#JJoj6EbiSBra z$cK=HS1uNl(BN1Zhj$NC7lS-r(OQm25$nDUMWn3ihq<#)u6*d3Cn{%GPCxO{(1hWH z4V5RUZ>sCamKU0rAcD1vrTZf^cnwu7Z_(nhaIu)N!0jPs%)Zc!3|YCr*RCq-P8A@@ 
z@yZMB$k)eC3fvruOe-B{yfV-R-*f2K^$jB%_OKYe3%^&?V`j(}4i(Q26_1Wlo*1f% zjC^@^_q1_&vC|wiv*jY+@)#z0Ix5*UNiHgl$Jy5(XFwu6G@?A)uNd^F@~M7#Gy*ps z6b?oy4~8C$Kp3bzzH5j`)_;)Jk`a44-c{d1a{ccpo}DlveAbWSN1;yht43~Cv*6`f zPzlCIrK73&)j}Z)|4%{FuI+yHu_iby}gqWEXNma`0f8l%%GPD`0PhDG&g4Y(hd zjKQ}v)Wa|j(QzR?7me_G0b^n$bx>lYiv95obC91+&(bjhhYB1tfIjd2XMVLiy_bUF z2=PuX!f+7~+}U>;`iUa4h_Tb1kfj(P*68${{E`b(eD#1%OF>HJ$e$GB1E2)5=ukk0 zP=Pt-aZySIYMMtt&Zw(Bt7!-$P$?H!+tIZJy0VaZ1zsoM77Irbv-|CjW1qGzTrJNfoR75 z5Jzp!a>7pD^XErBlXTCIBnlT=RBsv<ZDB-VlI!fZf=6r>}-u+kaIHvpyVK z@s%LlvmtjzB|-)se0kR}UZ_F;5hzKZl0ka|53B^f4HW}e$AF0>*E3OWDs$wtJ3g{Y z;~nncE-Ns=5VUh~jAlIesemx ztE&7RGz9qkZT!dN!|`~O@;`tiR4<_gzrXYK&S3X0d))7^ZC5Vc24DLqW78LaHw){Rk4C&49$0IoZqf~mMUqxnJ67@&W zSN8b;rmiWFUE;Pw{JIr{v*oiBcU|8B~t^1AYmC%eh zD1V0f>G+~Set_x1__!2n`na@@>-ZCq6!Jp-D(dAIDIw!o9_JkrSw}kOl{26Pn-hkq zJVDtJ8w6ii{2~>8F<#O#;3)2xVo7;`H1(p9I6BV7B)#Ra=-wxd=asV_?sdQ@iz{Av ze#qK+l@7jVkPY~PakI9Z8?KV4K`_KCWh!I;myhAcnz5?2uxAvna82at|+{JVA8EYrv9Uo6F3ZbNU?ao956{`@8*jCy#a) zzmBqjOwjie68057e7r4g-!VUHq+yaD)mr#tEBppBB)?m%KDPisdPWb|-Jx?}jo~YU z1%};4v<`aTgkKBxw3qhNLE5*rGO)i#aP1ryP5W)qu+3DPMcucu?>onei&%d7N63tZ?3NNC8L|p}?JED62Ky6A{*;nGqol9eGINS(Kf;EHuJIeh%C8QZ-h@>eBKEO{rwKny8|Oh}CS$=* zwhe6>i_x+1hNfGu?|fH_m;!U-lWw@hCRs3P?(3LOWM!V05&DR zfCMf>rvF!{fQ`$qQSOVB{5mDvXTZweKv~I>U!tlpO8%J=uHV&?%1RUT-2~kqsZ{Xo zS@Oj~CCpVSFChRH)$qpx2$@Z;Q}Stow(@3 zS1S?b@uW`A`0&r)bn6$QFBwc=5uC<3h$&6f%QPiqaU>o7WR?0kcDb$qr4Ql8^&823uLjQYF%yR}!_!AuYuasM(+aO;4 Date: Mon, 3 Nov 2025 09:31:10 +0800 Subject: [PATCH 6/6] feature(wrh): add pi3 injection --initial version --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index b71c432..ec162be 100644 --- a/.gitignore +++ b/.gitignore @@ -154,5 +154,4 @@ rollouts/ wandb/ outputs/ experiments/logs/ -evaluation_results/ -vla_adapter.egg-info/ +evaluation_results/ \ No newline at end of file