Dear authors,
First, thank you for your excellent work on DMD3C. I’ve successfully reproduced the results on the KITTI dataset with good performance, closely matching those reported in the paper.
However, when I tried to evaluate the model on the NYUv2 dataset, using the same pretrained weights (dmd3c_distillation_depth_anything_v2.pth) and adapting the inference code accordingly, I observed a significant performance drop. Specifically:
The predicted depth maps are blurry and often inconsistent with scene geometry.
Input preprocessing (resizing, normalization, intrinsic matrix scaling) has been carefully matched to the NYUv2 protocol (see the sketch after this list).
The predictions appear to collapse onto a few discrete depth planes, so nearby objects are not well separated from one another.
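For reference, here is a minimal sketch of the intrinsics bookkeeping I am doing, assuming the standard NYUv2 intrinsics at the native 480x640 resolution and the resize -> center-crop -> pad pipeline from the script below (the helper name adjust_intrinsics is mine, for illustration only):

import numpy as np

def adjust_intrinsics(K, scale, crop_left, crop_top, pad_left, pad_top):
    # Resizing scales fx, fy, cx, cy; crop and pad only shift the principal point.
    K = K.copy()
    K[:2] *= scale
    K[0, 2] += pad_left - crop_left
    K[1, 2] += pad_top - crop_top
    return K

# NYUv2 intrinsics at 480x640
K = np.array([[518.8579, 0.0, 325.5824],
              [0.0, 519.4696, 253.7362],
              [0.0, 0.0, 1.0]], dtype=np.float32)

# 480x640 -> 240x320 (scale 0.5), center-crop to 228x304 (left 8, top 6),
# then pad back to 256x320 (left 8, top 14)
K_adj = adjust_intrinsics(K, 0.5, crop_left=8, crop_top=6, pad_left=8, pad_top=14)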
Could you kindly confirm:
Is the provided checkpoint also expected to perform well on NYUv2, or is it trained only for KITTI?
Is there any official configuration (e.g., data=NYU setting, normalization scheme, intrinsic scaling, etc.) we should use to run on NYUv2?
Have you encountered similar results when testing on NYUv2?
Any guidance or clarification would be greatly appreciated.
Best regards,
My code is as follows:
import os
from pathlib import Path

import cv2
import h5py
import hydra
import numpy as np
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from PIL import Image
from tqdm import tqdm

# Trainer and natural_sort_key are helpers from the DMD3C codebase;
# mask_sparse_depth is sketched further below.

@hydra.main(config_path='configs', config_name='config', version_base='1.2')
def main(cfg):
    with Trainer(cfg) as run:
        net = run.net_ema.module.cuda()
        net.eval()
        # Read the left camera images (commented paths are KITTI; active path is NYUv2 val)
        # base = "datas/kitti/raw/2011_09_26/2011_09_26_drive_0002_sync"
        # base = "datas/kitti/raw/2011_09_26/2011_09_26_drive_0048_sync"
        base = "/data/user/Proj/BP-Net/DMD3C/datas/nyudepthv2/val"
        base = os.path.join(base, 'official')
        # Create the output directory
        output_dir = "outputs_nyu2_1500/"
        os.makedirs(output_dir, exist_ok=True)
        image_type = 'color'  # 'grayscale' or 'color' image
        mode = '00' if image_type == 'grayscale' else '02'  # KITTI camera id; unused for NYUv2
        height, width = (240, 320)   # resize target (half of NYUv2's 480x640)
        crop_size = (228, 304)       # standard NYUv2 center crop
        image_mean = np.array([0.485, 0.456, 0.406])  # ImageNet statistics
        image_std = np.array([0.229, 0.224, 0.225])
        num_sample = 1500            # sparse depth samples per image
        num_mask = 8
        files = sorted(os.listdir(base), key=natural_sort_key)
        for i, file in tqdm(enumerate(files), total=len(files), desc="Processing files"):
            file = Path(os.path.join(base, file))
            seed = i % num_mask
            idx = i // num_mask
            with h5py.File(file, 'r') as f:
                rgb_h5 = f['rgb'][:].transpose(1, 2, 0)  # [3, H, W] -> [H, W, 3]
                dep_h5 = f['depth'][:]
            rgb = Image.fromarray(rgb_h5, mode='RGB')
            dep = Image.fromarray(dep_h5.astype('float32'), mode='F')
            t_rgb = T.Compose([
                T.Resize(height),
                T.CenterCrop(crop_size),
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
            ])
            t_dep = T.Compose([
                T.Resize(height),
                T.CenterCrop(crop_size),
                T.Lambda(lambda x: np.array(x)),  # convert the PIL image directly to a NumPy array
                T.ToTensor(),
            ])
            rgb = t_rgb(rgb)  # -> [3, H, W]
            dep = t_dep(dep)  # -> [1, H, W]
            dep_sp = mask_sparse_depth(dep, num_sample, seed)
            # Pad 228x304 -> 256x320 (8 px left/right, 14 px top/bottom)
            rgb = TF.pad(rgb, padding=[8, 14], padding_mode='edge')
            dep_sp = TF.pad(dep_sp, padding=[8, 14], padding_mode='constant')
            dep = TF.pad(dep, padding=[8, 14], padding_mode='constant')
            # NYUv2 intrinsics at the native 480x640 resolution
            K_cam = torch.from_numpy(np.array(
                [
                    [5.1885790117450188e+02, 0, 3.2558244941119034e+02],
                    [0, 5.1946961112127485e+02, 2.5373616633400465e+02],
                    [0, 0, 1.],
                ], dtype=np.float32
            )).cuda()
            K_cam[:2] = K_cam[:2] / 2.  # resize 480x640 -> 240x320 halves fx, fy, cx, cy
            # Principal point shifts by pad offset minus center-crop offset
            K_cam[0, 2] += 8 - 8    # pad left (8) - crop left ((320 - 304) / 2 = 8)
            K_cam[1, 2] += 14 - 6   # pad top (14) - crop top ((240 - 228) / 2 = 6)
            rgb_batched = rgb[None].cuda()
            dep_sp_batched = dep_sp[None].cuda()
            K_cam_batched = K_cam[None]  # [3, 3] -> [1, 3, 3]
            with torch.no_grad():
                output = net(rgb_batched, None, dep_sp_batched, K_cam_batched)
            if isinstance(output, (list, tuple)):
                output = output[-1]
            output = output.squeeze().detach().cpu().numpy()
            rgb_np = rgb.permute(1, 2, 0).cpu().numpy()  # [H, W, 3]
            rgb_np = (rgb_np * image_std + image_mean) * 255.0  # undo ImageNet normalization
            rgb_np = np.clip(rgb_np, 0, 255).astype(np.uint8)
            # Save depth as 16-bit PNG (meters * 256); PNG cannot store float32 directly
            cv2.imwrite(f'{output_dir}/{file.stem}_depth.png', (output * 256.0).astype(np.uint16))
            cv2.imwrite(f'{output_dir}/{file.stem}_image.png', cv2.cvtColor(rgb_np, cv2.COLOR_RGB2BGR))
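For completeness, mask_sparse_depth is not shown above; in case it matters, here is a minimal sketch of the sampling I use, assuming the usual protocol of keeping num_sample valid pixels chosen with a fixed seed (the implementation details are mine and may differ from the official sampler):

import torch

def mask_sparse_depth(dep, num_sample, seed):
    # dep: [1, H, W] dense depth; keep num_sample valid (> 0) pixels, zero the rest
    flat = dep.reshape(-1)
    valid = torch.nonzero(flat > 0.0, as_tuple=False).squeeze(1)
    g = torch.Generator().manual_seed(seed)
    perm = torch.randperm(valid.numel(), generator=g)[:num_sample]
    sparse = torch.zeros_like(flat)
    sparse[valid[perm]] = flat[valid[perm]]
    return sparse.reshape(dep.shape)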

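The performance drop is measured with the standard NYUv2 metrics (RMSE, REL, delta < 1.25); a minimal sketch of that computation (the helper name nyu_metrics is mine), assuming depths in meters and evaluating valid ground-truth pixels only, without the official eval crop:

import numpy as np

def nyu_metrics(pred, gt):
    # pred, gt: [H, W] depth in meters; evaluate only where ground truth is valid
    mask = gt > 0.0
    pred, gt = pred[mask], gt[mask]
    rmse = np.sqrt(np.mean((pred - gt) ** 2))
    rel = np.mean(np.abs(pred - gt) / gt)
    delta1 = np.mean(np.maximum(pred / gt, gt / pred) < 1.25)
    return rmse, rel, delta1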

