已经发布的权重里没有深度信息以及点云信息的权重

您好！
目前从 Hugging Face 下载的 MobileVLA 权重中，没有找到深度（depth）和点云（point）分支的权重，这不利于实机部署。
排查过程如下：
(mobilevla) qirobot@qirobot:~/MobileVLA-R1$ grep -R "navcot_use_depth\|navcot_use_point\|depth_tower\|point_tower\|depth_encoder\|point_encoder" -n scripts train.py gen_worker.py llava | head -200
grep: llava/data/__pycache__/dataset.cpython-310.pyc: 匹配到二进制文件
scripts/train/sft_8frames.sh:40:    --use_depth_tower True \
scripts/train/sft_8frames.sh:41:    --use_point_tower True \
scripts/train/sft_8frames.sh:42:    --depth_tower depth_anything_v2 \
scripts/train/sft_8frames.sh:43:    --point_tower point_transformer \
scripts/train/sft_8frames.sh:47:    --navcot_use_depth True \
scripts/train/sft_8frames.sh:48:    --navcot_use_point True \
gen_worker.py:22:navcot_use_depth = bool(navcot_depth_root)
gen_worker.py:23:navcot_use_point = os.environ.get("NAVCOT_USE_POINT", "1") == "1"
gen_worker.py:372:    if not navcot_use_depth or not navcot_depth_root:
gen_worker.py:444:            point_inputs = "from_depth" if (navcot_use_point and depth_inputs) else None
llava/data/dataset.py:264:        if not self.data_args.navcot_use_depth or not self.data_args.navcot_depth_root:
llava/data/dataset.py:318:        depth_tensor = self._load_navcot_depth(resolved) if self.data_args.navcot_use_depth else None
llava/data/dataset.py:320:        if self.data_args.navcot_use_point and depth_tensor is not None:
grep: llava/model/__pycache__/llava_arch.cpython-310.pyc: 匹配到二进制文件
llava/model/configuration_llava.py:74:        self.depth_tower_cfg = kwargs.pop("depth_tower_cfg", None)
llava/model/configuration_llava.py:75:        self.point_tower_cfg = kwargs.pop("point_tower_cfg", None)
llava/model/configuration_llava.py:76:        self.use_depth_tower = kwargs.pop("use_depth_tower", False)
llava/model/configuration_llava.py:77:        self.use_point_tower = kwargs.pop("use_point_tower", False)

grep: llava/model/__pycache__/configuration_llava.cpython-310.pyc: 匹配到二进制文件
llava/model/llava_arch copy.py:33:from llava.model.multimodal_encoder.builder import build_depth_tower, build_point_tower, build_vision_tower
llava/model/llava_arch copy.py:72:        self.depth_tower = build_depth_tower(getattr(config, "depth_tower_cfg", None), config)
llava/model/llava_arch copy.py:73:        self.point_tower = build_point_tower(getattr(config, "point_tower_cfg", None), config)
llava/model/llava_arch copy.py:77:        if self.depth_tower is not None and getattr(config, "depth_hidden_size", mm_hidden) != mm_hidden:
llava/model/llava_arch copy.py:79:        if self.point_tower is not None and getattr(config, "point_hidden_size", mm_hidden) != mm_hidden:
llava/model/llava_arch copy.py:82:        if self.depth_tower is not None:
llava/model/llava_arch copy.py:83:            self.depth_tower = self.depth_tower.to(model_dtype)
llava/model/llava_arch copy.py:86:        if self.point_tower is not None:
llava/model/llava_arch copy.py:87:            self.point_tower = self.point_tower.to(model_dtype)
llava/model/llava_arch copy.py:238:        if getattr(self.config, "depth_tower_cfg", None) is None and getattr(self, "depth_tower", None):
llava/model/llava_arch copy.py:239:            self.config.depth_tower_cfg = getattr(self.config, "depth_tower_cfg", "depth_encoder")
llava/model/llava_arch copy.py:240:        if getattr(self.config, "point_tower_cfg", None) is None and getattr(self, "point_tower", None):
llava/model/llava_arch copy.py:241:            self.config.point_tower_cfg = getattr(self.config, "point_tower_cfg", "point_encoder")
llava/model/llava_arch copy.py:255:        if getattr(self, "depth_tower", None) and not getattr(self.config, "tune_depth_tower", False):
llava/model/llava_arch copy.py:256:            self.depth_tower.eval()
llava/model/llava_arch copy.py:257:        if getattr(self, "point_tower", None) and not getattr(self.config, "tune_point_tower", False):
llava/model/llava_arch copy.py:258:            self.point_tower.eval()
llava/model/llava_arch copy.py:283:        if depth_inputs is not None and self.depth_tower is not None:
llava/model/llava_arch copy.py:284:            depth_feats = self.depth_tower(depth_inputs.to(device=device, dtype=dtype))
llava/model/llava_arch copy.py:290:        if point_inputs is not None and self.point_tower is not None:
llava/model/llava_arch copy.py:291:            point_feats = self.point_tower(point_inputs.to(device=device, dtype=dtype))
llava/model/llava_arch.py:33:from llava.model.multimodal_encoder.builder import build_depth_tower, build_point_tower, build_vision_tower
llava/model/llava_arch.py:72:        self.depth_tower = build_depth_tower(getattr(config, "depth_tower_cfg", None), config)
llava/model/llava_arch.py:73:        self.point_tower = build_point_tower(getattr(config, "point_tower_cfg", None), config)
llava/model/llava_arch.py:77:        if self.depth_tower is not None and getattr(config, "depth_hidden_size", mm_hidden) != mm_hidden:
llava/model/llava_arch.py:79:        if self.point_tower is not None and getattr(config, "point_hidden_size", mm_hidden) != mm_hidden:
llava/model/llava_arch.py:82:        if self.depth_tower is not None:
llava/model/llava_arch.py:83:            self.depth_tower = self.depth_tower.to(model_dtype)
llava/model/llava_arch.py:86:        if self.point_tower is not None:
llava/model/llava_arch.py:87:            self.point_tower = self.point_tower.to(model_dtype)
llava/model/llava_arch.py:238:        if getattr(self.config, "depth_tower_cfg", None) is None and getattr(self, "depth_tower", None):
llava/model/llava_arch.py:239:            self.config.depth_tower_cfg = getattr(self.config, "depth_tower_cfg", "depth_encoder")
llava/model/llava_arch.py:240:        if getattr(self.config, "point_tower_cfg", None) is None and getattr(self, "point_tower", None):
llava/model/llava_arch.py:241:            self.config.point_tower_cfg = getattr(self.config, "point_tower_cfg", "point_encoder")
llava/model/llava_arch.py:255:        if getattr(self, "depth_tower", None) and not getattr(self.config, "tune_depth_tower", False):
grep: llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc: 匹配到二进制文件
llava/model/llava_arch.py:256:            self.depth_tower.eval()
llava/model/llava_arch.py:257:        if getattr(self, "point_tower", None) and not getattr(self.config, "tune_point_tower", False):
llava/model/llava_arch.py:258:            self.point_tower.eval()
llava/model/llava_arch.py:283:        if depth_inputs is not None and self.depth_tower is not None:
llava/model/llava_arch.py:284:            depth_feats = self.depth_tower(depth_inputs.to(device=device, dtype=dtype))
llava/model/llava_arch.py:290:        if point_inputs is not None and self.point_tower is not None:
llava/model/llava_arch.py:291:            point_feats = self.point_tower(point_inputs.to(device=device, dtype=dtype))
llava/model/multimodal_encoder/builder.py:27:from .depth_encoder import DepthFeatureEncoder
llava/model/multimodal_encoder/builder.py:73:def build_depth_tower(model_name_or_path: Optional[str], config: PretrainedConfig) -> Optional[nn.Module]:
llava/model/multimodal_encoder/builder.py:74:    if not getattr(config, "use_depth_tower", False):
llava/model/multimodal_encoder/builder.py:77:    if model_name_or_path is None or model_name_or_path.lower() in ("depth_anything_v2", "depth_encoder"):
llava/model/multimodal_encoder/builder.py:82:def build_point_tower(model_name_or_path: Optional[str], config: PretrainedConfig) -> Optional[nn.Module]:
llava/model/multimodal_encoder/builder.py:83:    if not getattr(config, "use_point_tower", False):
llava/model/multimodal_encoder/builder.py:87:    if model_name_or_path is None or model_name_or_path.lower() in ("point_transformer", "point_encoder"):
grep: llava/model/multimodal_encoder/__pycache__/depth_encoder.cpython-310.pyc: 匹配到二进制文件
grep: llava/train/__pycache__/utils.cpython-310.pyc: 匹配到二进制文件
grep: llava/train/__pycache__/args.cpython-310.pyc: 匹配到二进制文件
llava/train/args.py:58:    navcot_use_depth: bool = field(
llava/train/args.py:62:    navcot_use_point: bool = field(
llava/train/args.py:105:    use_depth_tower: bool = field(
llava/train/args.py:109:    use_point_tower: bool = field(
llava/train/args.py:113:    depth_tower: Optional[str] = field(
llava/train/args.py:115:        metadata={"help": "Depth encoder identifier or path when --use_depth_tower is enabled."},
llava/train/args.py:117:    point_tower: Optional[str] = field(
llava/train/args.py:119:        metadata={"help": "Point-cloud encoder identifier or path when --use_point_tower is enabled."},
llava/train/utils.py:93:    config.use_depth_tower = getattr(model_args, "use_depth_tower", False)
llava/train/utils.py:94:    config.use_point_tower = getattr(model_args, "use_point_tower", False)
llava/train/utils.py:95:    config.depth_tower_cfg = getattr(model_args, "depth_tower", None)
llava/train/utils.py:96:    config.point_tower_cfg = getattr(model_args, "point_tower", None)


(mobilevla) qirobot@qirobot:~/MobileVLA-R1$ sed -n '1,180p' llava/model/multimodal_encoder/builder.py
from typing import Optional
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

# This file is modified from https://github.com/haotian-liu/LLaVA/

import os

import torch.nn as nn

from transformers import AutoConfig, PretrainedConfig, PreTrainedModel

from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
from .depth_encoder import DepthFeatureEncoder
from .intern_encoder import InternVisionTower, InternVisionTowerS2
from .point_transformer_encoder import PointTransformerEncoder
from .radio_encoder import RADIOVisionTower
from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerS2


def build_vision_tower(model_name_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
    """Instantiate the vision encoder selected by ``model_name_or_path``.

    The encoder family is chosen by substring match on a lookup key. The key
    is ``model_name_or_path`` itself, unless ``config.resume_path`` is truthy
    and the path does not contain ``"radio"``; in that case the key is the
    lower-cased first entry of ``architectures`` from the ``AutoConfig``
    loaded at that path.

    Families are tested in this order: ``"intern"`` (case-insensitive), then
    ``"radio"``, ``"clip"`` and ``"siglip"`` (matched against the key as-is).
    For intern/clip/siglip the ``*S2`` class is used when ``config.s2`` is
    truthy.

    Side effect: ``config.mm_hidden_size`` is set from the built tower.

    Args:
        model_name_or_path: Encoder identifier or checkpoint path, or ``None``
            to skip instantiation.
        config: Model configuration; ``resume_path`` is read directly, ``s2``
            and ``drop_path_rate`` are read with defaults.

    Returns:
        The constructed tower, or ``None`` when ``model_name_or_path`` is
        ``None``.

    Raises:
        AssertionError: When resuming and ``model_name_or_path`` does not
            exist on disk.
        ValueError: When the lookup key matches no known family.
    """
    # Nothing to build when no encoder was requested.
    if model_name_or_path is None:
        return None

    tower_key = model_name_or_path
    if config.resume_path and "radio" not in model_name_or_path:
        assert os.path.exists(model_name_or_path), f"Resume vision tower path {model_name_or_path} does not exist!"
        resumed_cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
        # When resuming, dispatch on the saved architecture name rather than the path.
        tower_key = resumed_cfg.architectures[0].lower()

    multi_scale = getattr(config, "s2", False)

    if "intern" in tower_key.lower():
        stochastic_depth = getattr(config, "drop_path_rate", 0.0)
        intern_cls = InternVisionTowerS2 if multi_scale else InternVisionTower
        tower = intern_cls(model_name_or_path, config=config, drop_path_rate=stochastic_depth)
    elif "radio" in tower_key:
        tower = RADIOVisionTower(model_name_or_path, config)
    elif "clip" in tower_key:
        clip_cls = CLIPVisionTowerS2 if multi_scale else CLIPVisionTower
        tower = clip_cls(model_name_or_path, config)
    elif "siglip" in tower_key:
        siglip_cls = SiglipVisionTowerS2 if multi_scale else SiglipVisionTower
        tower = siglip_cls(model_name_or_path, config)
    else:
        raise ValueError(f"Unknown vision tower: {model_name_or_path}")

    # S2 towers expose the hidden size on the tower; the others on tower.config.
    if multi_scale:
        config.mm_hidden_size = tower.hidden_size
    else:
        config.mm_hidden_size = tower.config.hidden_size
    return tower


def build_depth_tower(model_name_or_path: Optional[str], config: PretrainedConfig) -> Optional[nn.Module]:
    """Build the depth encoder when ``config.use_depth_tower`` is enabled.

    Args:
        model_name_or_path: Depth encoder identifier. ``None``,
            ``"depth_anything_v2"`` and ``"depth_encoder"`` (compared
            case-insensitively) all select ``DepthFeatureEncoder``.
        config: Model configuration. ``use_depth_tower`` gates construction;
            ``depth_hidden_size`` sets the encoder width, with
            ``mm_hidden_size`` as the fallback when that attribute is absent.

    Returns:
        A ``DepthFeatureEncoder``, or ``None`` when the depth tower is
        disabled or the flag is missing from ``config``.

    Raises:
        ValueError: When the tower is enabled and the identifier is not one
            of the recognised names.
    """
    if getattr(config, "use_depth_tower", False):
        # Resolved before the identifier check, matching the original order.
        encoder_width = getattr(config, "depth_hidden_size", config.mm_hidden_size)
        recognised = model_name_or_path is None or model_name_or_path.lower() in (
            "depth_anything_v2",
            "depth_encoder",
        )
        if not recognised:
            raise ValueError(f"Unknown depth tower: {model_name_or_path}")
        return DepthFeatureEncoder(hidden_size=encoder_width)
    return None


def build_point_tower(model_name_or_path: Optional[str], config: PretrainedConfig) -> Optional[nn.Module]:
    """Build the point-cloud encoder when ``config.use_point_tower`` is enabled.

    Args:
        model_name_or_path: Point encoder identifier. ``None``,
            ``"point_transformer"`` and ``"point_encoder"`` (compared
            case-insensitively) all select ``PointTransformerEncoder``.
        config: Model configuration. ``use_point_tower`` gates construction;
            ``point_hidden_size`` sets the encoder width, with
            ``mm_hidden_size`` as the fallback when that attribute is absent.

    Returns:
        A ``PointTransformerEncoder`` built with ``in_channels=3``, or
        ``None`` when the point tower is disabled or the flag is missing
        from ``config``.

    Raises:
        ValueError: When the tower is enabled and the identifier is not one
            of the recognised names.
    """
    if getattr(config, "use_point_tower", False):
        # Resolved before the identifier check, matching the original order.
        encoder_width = getattr(config, "point_hidden_size", config.mm_hidden_size)
        recognised = model_name_or_path is None or model_name_or_path.lower() in (
            "point_transformer",
            "point_encoder",
        )
        if not recognised:
            raise ValueError(f"Unknown point tower: {model_name_or_path}")
        # NOTE(review): 3 input channels presumably means xyz coordinates; confirm.
        return PointTransformerEncoder(in_channels=3, hidden_size=encoder_width)
    return None


(mobilevla) qirobot@qirobot:~/MobileVLA-R1$ python - <<'PY'
# Diagnostic: load the released checkpoint and report whether any depth /
# point-cloud modules or config entries exist on the loaded model.
from inference import NaVILAImageInference

# Load the checkpoint from the local "rl" weights directory with flash
# attention disabled.
m = NaVILAImageInference(
    model_path="/home/qirobot/HDD/MobileVLA/weights/weight/rl",
    use_flash_attn=False,
)

# getattr with a None default: a missing attribute prints the same as an
# attribute that is explicitly None.
print("depth_tower:", getattr(m.model, "depth_tower", None))
print("point_tower:", getattr(m.model, "point_tower", None))
print("depth_bridge:", getattr(m.model, "depth_bridge", None))
print("point_bridge:", getattr(m.model, "point_bridge", None))

# Print every config attribute whose name mentions "depth" or "point"
# (case-insensitive), together with its current value.
cfg = m.model.config
for k in dir(cfg):
    if "depth" in k.lower() or "point" in k.lower():
        print(k, getattr(cfg, k))
PY
[2026-05-03 17:50:34,410] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2026-05-03 17:50:34.944 | INFO     | llava.data.builder:register_datasets:19 - Registering datasets from `default`.
Loading model from /home/qirobot/HDD/MobileVLA/weights/weight/rl...
Loading checkpoint shards: 100%|████████████| 4/4 [00:02<00:00,  1.45it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
depth_tower: None
point_tower: None
depth_bridge: None
point_bridge: None
depth_hidden_size None
depth_tower_cfg None
point_hidden_size None
point_tower_cfg None
pointcloud_points None
use_depth_tower False
use_point_tower False


(mobilevla) qirobot@qirobot:~/MobileVLA-R1$ python - <<'PY'
# Diagnostic: scan every .safetensors shard under the checkpoint directory
# for tensor names that mention "depth" or "point".
from safetensors.torch import safe_open
import glob

# recursive=True lets "**" match nested sub-directories of the checkpoint.
for f in glob.glob("/home/qirobot/HDD/MobileVLA/weights/weight/rl/**/*.safetensors", recursive=True):
    print("\n===", f)
    with safe_open(f, framework="pt", device="cpu") as sf:
        keys = list(sf.keys())
        # Case-insensitive substring match on the tensor names.
        hit = [k for k in keys if "depth" in k.lower() or "point" in k.lower()]
        if hit:
            # Print at most the first 50 matching names per file.
            for k in hit[:50]:
                print(k)
        else:
            print("No depth/point keys")
PY

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/vision_tower/model.safetensors
No depth/point keys

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/mm_projector/model.safetensors
No depth/point keys

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/llm/model-00001-of-00004.safetensors
No depth/point keys

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/llm/model-00002-of-00004.safetensors
No depth/point keys

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/llm/model-00003-of-00004.safetensors
No depth/point keys

=== /home/qirobot/HDD/MobileVLA/weights/weight/rl/llm/model-00004-of-00004.safetensors
No depth/point keys

请问后续是否会继续发布完整的多模态权重？

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

已经发布的权重里没有深度信息以及点云信息的权重 #8

Copyright 2024 NVIDIA CORPORATION & AFFILIATES

Licensed under the Apache License, Version 2.0 (the "License");

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an "AS IS" BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

SPDX-License-Identifier: Apache-2.0

This file is modified from https://github.com/haotian-liu/LLaVA/

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Uh oh!

已经发布的权重里没有深度信息以及点云信息的权重 #8

Description

Copyright 2024 NVIDIA CORPORATION & AFFILIATES

Licensed under the Apache License, Version 2.0 (the "License");

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an "AS IS" BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

SPDX-License-Identifier: Apache-2.0

This file is modified from https://github.com/haotian-liu/LLaVA/

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions