diff --git a/.dev_scripts/gather_models.py b/.dev_scripts/gather_models.py index 58919fd444..e09dc9e230 100644 --- a/.dev_scripts/gather_models.py +++ b/.dev_scripts/gather_models.py @@ -3,17 +3,28 @@ Usage: python gather_models.py ${root_path} ${out_dir} + +Example: +python gather_models.py \ +work_dirs/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d \ +work_dirs/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d + +Note that before running the above command, rename the directory with the +config name if you did not use the default directory name, create +a corresponding directory 'pgd' under the above path and put the used config +into it. """ import argparse import glob import json -import mmcv import shutil import subprocess -import torch from os import path as osp +import mmcv +import torch + # build schedule look-up table to automatically find the final model SCHEDULES_LUT = { '_1x_': 12, @@ -25,6 +36,7 @@ '_6x_': 73, '_50e_': 50, '_80e_': 80, + '_100e_': 100, '_150e_': 150, '_200e_': 200, '_250e_': 250, @@ -35,16 +47,18 @@ RESULTS_LUT = { 'coco': ['bbox_mAP', 'segm_mAP'], 'nus': ['pts_bbox_NuScenes/NDS', 'NDS'], - 'kitti-3d-3class': [ - 'KITTI/Overall_3D_moderate', - 'Overall_3D_moderate', - ], + 'kitti-3d-3class': ['KITTI/Overall_3D_moderate', 'Overall_3D_moderate'], 'kitti-3d-car': ['KITTI/Car_3D_moderate_strict', 'Car_3D_moderate_strict'], 'lyft': ['score'], 'scannet_seg': ['miou'], 's3dis_seg': ['miou'], 'scannet': ['mAP_0.50'], - 'sunrgbd': ['mAP_0.50'] + 'sunrgbd': ['mAP_0.50'], + 'kitti-mono3d': [ + 'img_bbox/KITTI/Car_3D_AP40_moderate_strict', + 'Car_3D_AP40_moderate_strict' + ], + 'nus-mono3d': ['img_bbox_NuScenes/NDS', 'NDS'] } @@ -144,15 +158,13 @@ def main(): # and parse the best performance model_infos = [] for used_config in used_configs: - exp_dir = osp.join(models_root, used_config) - # get logs - log_json_path = glob.glob(osp.join(exp_dir, '*.log.json'))[0] - log_txt_path = glob.glob(osp.join(exp_dir, '*.log'))[0] + log_json_path = glob.glob(osp.join(models_root, '*.log.json'))[0] + log_txt_path = glob.glob(osp.join(models_root, '*.log'))[0] model_performance = get_best_results(log_json_path) final_epoch = model_performance['epoch'] final_model = 'epoch_{}.pth'.format(final_epoch) - model_path = osp.join(exp_dir, final_model) + model_path = osp.join(models_root, final_model) # skip if the model is still training if not osp.exists(model_path): @@ -181,7 +193,7 @@ def main(): model_name = model['config'].split('/')[-1].rstrip( '.py') + '_' + model['model_time'] publish_model_path = osp.join(model_publish_dir, model_name) - trained_model_path = osp.join(models_root, model['config'], + trained_model_path = osp.join(models_root, 'epoch_{}.pth'.format(model['epochs'])) # convert model @@ -190,11 +202,10 @@ def main(): # copy log shutil.copy( - osp.join(models_root, model['config'], model['log_json_path']), + osp.join(models_root, model['log_json_path']), osp.join(model_publish_dir, f'{model_name}.log.json')) shutil.copy( - osp.join(models_root, model['config'], - model['log_json_path'].rstrip('.json')), + osp.join(models_root, model['log_json_path'].rstrip('.json')), osp.join(model_publish_dir, f'{model_name}.log')) # copy config to guarantee reproducibility diff --git a/.dev_scripts/test_benchmark.sh b/.dev_scripts/test_benchmark.sh index f020d7ff9e..6223fb1303 100644 --- a/.dev_scripts/test_benchmark.sh +++ b/.dev_scripts/test_benchmark.sh @@ -25,11 +25,11 @@ GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION fcos3d_r 
$CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py/latest.pth --eval map \ 2>&1|tee $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py/FULL_LOG.txt & -echo 'configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' & -mkdir -p $CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py -GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \ -$CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/latest.pth --eval map \ -2>&1|tee $CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt & +echo 'configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' & +mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py +GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \ +$CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/latest.pth --eval map \ +2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt & echo 'configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py' & mkdir -p $CHECKPOINT_DIR/configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py diff --git a/.dev_scripts/train_benchmark.sh b/.dev_scripts/train_benchmark.sh index 7655ab174e..5d3da9aa01 100644 --- a/.dev_scripts/train_benchmark.sh +++ b/.dev_scripts/train_benchmark.sh @@ -25,11 +25,11 @@ GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION fcos3d_ $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \ 2>&1|tee $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py/FULL_LOG.txt & -echo 'configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' & -mkdir -p $CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py -GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \ -$CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \ -2>&1|tee $CHECKPOINT_DIR/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt & +echo 'configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' & +mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py +GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \ +$CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \ +2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt & echo 'configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py' & mkdir -p $CHECKPOINT_DIR/configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py diff --git a/.github/ISSUE_TEMPLATE/error-report.md 
b/.github/ISSUE_TEMPLATE/error-report.md index 5ec0318463..703a7d44b7 100644 --- a/.github/ISSUE_TEMPLATE/error-report.md +++ b/.github/ISSUE_TEMPLATE/error-report.md @@ -28,7 +28,7 @@ A placeholder for the command. **Environment** -1. Please run `python mmdet3d/utils/collect_env.py` to collect necessary environment infomation and paste it here. +1. Please run `python mmdet3d/utils/collect_env.py` to collect necessary environment information and paste it here. 2. You may add addition that may be helpful for locating the problem, such as - How you installed PyTorch [e.g., pip, conda, source] - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) diff --git a/.github/ISSUE_TEMPLATE/reimplementation_questions.md b/.github/ISSUE_TEMPLATE/reimplementation_questions.md index 8e36e80912..9c17fb6794 100644 --- a/.github/ISSUE_TEMPLATE/reimplementation_questions.md +++ b/.github/ISSUE_TEMPLATE/reimplementation_questions.md @@ -46,7 +46,7 @@ A placeholder for the config. **Environment** -1. Please run `python mmdet3d/utils/collect_env.py` to collect necessary environment infomation and paste it here. +1. Please run `python mmdet3d/utils/collect_env.py` to collect necessary environment information and paste it here. 2. You may add addition that may be helpful for locating the problem, such as - How you installed PyTorch [e.g., pip, conda, source] - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8a4b61cb77..5bd88c5a27 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,12 +3,8 @@ repos: rev: 3.8.3 hooks: - id: flake8 - - repo: https://github.com/asottile/seed-isort-config - rev: v2.2.0 - hooks: - - id: seed-isort-config - - repo: https://github.com/timothycrosley/isort - rev: 5.0.2 + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf @@ -43,3 +39,9 @@ repos: hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 # Use the ref you want to point at + hooks: + - id: check-algo-readme + - id: check-copyright + args: ["mmdet3d"] # replace the dir_to_check with your expected directory to check diff --git a/README.md b/README.md index 0080bbcd39..178c20ae0c 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,12 @@ [![license](https://img.shields.io/github/license/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/blob/master/LICENSE) -**News**: We released the codebase v0.18.1. +**News**: We released the codebase v1.0.0rc0. -In addition, we have preliminarily supported several new models on the [v1.0.0.dev0](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0) branch, including [DGCNN](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/dgcnn/README.md), [SMOKE](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/smoke/README.md) and [PGD](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/pgd/README.md). - -Note: We are going through large refactoring to provide simpler and more unified usage of many modules. Thus, few features will be added to the master branch in the following months. +Note: We are going through large refactoring to provide simpler and more unified usage of many modules. 
The compatibilities of models are broken due to the unification and simplification of coordinate systems. For now, most models are benchmarked with similar performance, though few models are still being benchmarked. -You can start experiments with [v1.0.0.dev0](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0) if you are interested. Please note that our new features will only be supported in v1.0.0 branch afterward. - In the [nuScenes 3D detection challenge](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) of the 5th AI Driving Olympics in NeurIPS 2020, we obtained the best PKL award and the second runner-up by multi-modality entry, and the best vision-only results. Code and models for the best vision-only method, [FCOS3D](https://arxiv.org/abs/2104.10956), have been released. Please stay tuned for [MoCa](https://arxiv.org/abs/2012.12741). @@ -87,11 +83,9 @@ This project is released under the [Apache 2.0 license](LICENSE). ## Changelog -v0.18.1 was released in 1/2/2022. +v1.0.0rc0 was released in 18/2/2022. Please refer to [changelog.md](docs/en/changelog.md) for details and release history. -For branch v1.0.0.dev0, please refer to [changelog_v1.0.md](https://github.com/Tai-Wang/mmdetection3d/blob/v1.0.0.dev0-changelog/docs/changelog_v1.0.md) for our latest features and more details. - ## Benchmark and model zoo Supported methods and backbones are shown in the below table. @@ -102,6 +96,8 @@ Support backbones: - [x] PointNet (CVPR'2017) - [x] PointNet++ (NeurIPS'2017) - [x] RegNet (CVPR'2020) +- [x] DGCNN (TOG'2019) +- [x] DLA (CVPR'2018) Support methods @@ -121,25 +117,31 @@ Support methods - [x] [Group-Free-3D (ICCV'2021)](configs/groupfree3d/README.md) - [x] [ImVoxelNet (WACV'2022)](configs/imvoxelnet/README.md) - [x] [PAConv (CVPR'2021)](configs/paconv/README.md) - -| | ResNet | ResNeXt | SENet |PointNet++ | HRNet | RegNetX | Res2Net | -|--------------------|:--------:|:--------:|:--------:|:---------:|:-----:|:--------:|:-----:| -| SECOND | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| PointPillars | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| FreeAnchor | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| Part-A2 | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| MVXNet | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| CenterPoint | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| SSN | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| FCOS3D | ✓ | ☐ | ☐ | ✗ | ☐ | ☐ | ☐ | -| PointNet++ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| Group-Free-3D | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| ImVoxelNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | -| PAConv | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | +- [x] [DGCNN (TOG'2019)](configs/dgcnn/README.md) +- [x] [SMOKE (CVPRW'2020)](configs/smoke/README.md) +- [x] [PGD (CoRL'2021)](configs/pgd/README.md) + +| | ResNet | ResNeXt | SENet |PointNet++ |DGCNN | HRNet | RegNetX | Res2Net | DLA | +|--------------------|:--------:|:--------:|:--------:|:---------:|:---------:|:-----:|:--------:|:-----:|:---:| +| SECOND | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| PointPillars | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| FreeAnchor | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| Part-A2 | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| MVXNet | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| CenterPoint | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| SSN | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | 
✗ | ✗ | ✗ +| FCOS3D | ✓ | ☐ | ☐ | ✗ | ✗ | ☐ | ☐ | ☐ | ✗ +| PointNet++ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| Group-Free-3D | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| ImVoxelNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ +| PAConv | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| DGCNN | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ +| SMOKE | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✓ +| PGD | ✓ | ☐ | ☐ | ✗ | ✗ | ☐ | ☐ | ☐ | ✗ Other features - [x] [Dynamic Voxelization](configs/dynamic_voxelization/README.md) diff --git a/README_zh-CN.md b/README_zh-CN.md index e4931c2a74..85fa01f3de 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -25,16 +25,12 @@ [![license](https://img.shields.io/github/license/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/blob/master/LICENSE) -**新闻**: 我们发布了版本 v0.18.1. +**新闻**: 我们发布了版本 v1.0.0rc0. -另外,我们在 [v1.0.0.dev0](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0) 分支初步支持了多个新模型,包括 [DGCNN](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/dgcnn/README.md), [SMOKE](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/smoke/README.md) 和 [PGD](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/configs/pgd/README.md)。 - -说明:我们正在进行大规模的重构,以提供对许多模块更简单、更统一的使用。因此,在接下来的几个月里,很少有功能会添加到主分支中。 +说明:我们正在进行大规模的重构,以提供对许多模块更简单、更统一的使用。 由于坐标系的统一和简化,模型的兼容性会受到影响。目前,大多数模型都以类似的性能对齐了精度,但仍有少数模型在进行基准测试。 -如果您感兴趣,可以开始使用 [v1.0.0.dev0](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0) 分支进行实验。请注意,我们的新功能将只支持在 v1.0.0 分支。 - 在第三届 [nuScenes 3D 检测挑战赛](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any)(第五届 AI Driving Olympics, NeurIPS 2020)中,我们获得了最佳 PKL 奖、第三名和最好的纯视觉的结果,相关的代码和模型将会在不久后发布。 最好的纯视觉方法 [FCOS3D](https://arxiv.org/abs/2104.10956) 的代码和模型已经发布。请继续关注我们的多模态检测器 [MoCa](https://arxiv.org/abs/2012.12741)。 @@ -87,11 +83,9 @@ MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱, 下一代 ## 更新日志 -最新的版本 v0.18.1 在 2022.2.1 发布。 +最新的版本 v1.0.0rc0 在 2022.2.18 发布。 如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/zh_cn/changelog.md)。 -对于分支 [v1.0.0.dev0](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0) ,请参考 [v1.0 更新日志](https://github.com/Tai-Wang/mmdetection3d/blob/v1.0.0.dev0-changelog/docs/changelog_v1.0.md) 来了解我们的最新功能和更多细节。 - ## 基准测试和模型库 测试结果和模型可以在[模型库](docs/zh_cn/model_zoo.md)中找到。 @@ -101,6 +95,8 @@ MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱, 下一代 - [x] PointNet (CVPR'2017) - [x] PointNet++ (NeurIPS'2017) - [x] RegNet (CVPR'2020) +- [x] DGCNN (TOG'2019) +- [x] DLA (CVPR'2018) 已支持的算法: @@ -120,25 +116,31 @@ MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱, 下一代 - [x] [Group-Free-3D (ICCV'2021)](configs/groupfree3d/README.md) - [x] [ImVoxelNet (WACV'2022)](configs/imvoxelnet/README.md) - [x] [PAConv (CVPR'2021)](configs/paconv/README.md) - -| | ResNet | ResNeXt | SENet |PointNet++ | HRNet | RegNetX | Res2Net | -|--------------------|:--------:|:--------:|:--------:|:---------:|:-----:|:--------:|:-----:| -| SECOND | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| PointPillars | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| FreeAnchor | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| Part-A2 | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| MVXNet | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| CenterPoint | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| SSN | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | -| ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| FCOS3D | ✓ | ☐ | ☐ | ✗ | ☐ | ☐ | ☐ | -| PointNet++ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| Group-Free-3D | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | -| ImVoxelNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | -| PAConv | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | 
+- [x] [DGCNN (TOG'2019)](configs/dgcnn/README.md) +- [x] [SMOKE (CVPRW'2020)](configs/smoke/README.md) +- [x] [PGD (CoRL'2021)](configs/pgd/README.md) + +| | ResNet | ResNeXt | SENet |PointNet++ |DGCNN | HRNet | RegNetX | Res2Net | DLA | +|--------------------|:--------:|:--------:|:--------:|:---------:|:---------:|:-----:|:--------:|:-----:|:---:| +| SECOND | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| PointPillars | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| FreeAnchor | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| Part-A2 | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| MVXNet | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| CenterPoint | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| SSN | ☐ | ☐ | ☐ | ✗ | ✗ | ☐ | ✓ | ☐ | ✗ +| ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| FCOS3D | ✓ | ☐ | ☐ | ✗ | ✗ | ☐ | ☐ | ☐ | ✗ +| PointNet++ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| Group-Free-3D | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| ImVoxelNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ +| PAConv | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ +| DGCNN | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ +| SMOKE | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✓ +| PGD | ✓ | ☐ | ☐ | ✗ | ✗ | ☐ | ☐ | ☐ | ✗ 其他特性 - [x] [Dynamic Voxelization](configs/dynamic_voxelization/README.md) diff --git a/configs/3dssd/README.md b/configs/3dssd/README.md index 5d3c0a9aeb..579ed25cd3 100644 --- a/configs/3dssd/README.md +++ b/configs/3dssd/README.md @@ -1,21 +1,20 @@ # 3DSSD: Point-based 3D Single Stage Object Detector -## Introduction +> [3DSSD: Point-based 3D Single Stage Object Detector](https://arxiv.org/abs/2002.10187) -We implement 3DSSD and provide the results and checkpoints on KITTI datasets. +## Abstract -``` -@inproceedings{yang20203dssd, - author = {Zetong Yang and Yanan Sun and Shu Liu and Jiaya Jia}, - title = {3DSSD: Point-based 3D Single Stage Object Detector}, - booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, - year = {2020} -} -``` +Currently, there have been many kinds of voxel-based 3D single stage detectors, while point-based single stage methods are still underexplored. In this paper, we first present a lightweight and effective point-based 3D single stage object detector, named 3DSSD, achieving a good balance between accuracy and efficiency. In this paradigm, all upsampling layers and refinement stage, which are indispensable in all existing point-based methods, are abandoned to reduce the large computation cost. We novelly propose a fusion sampling strategy in downsampling process to make detection on less representative points feasible. A delicate box prediction network including a candidate generation layer, an anchor-free regression head with a 3D center-ness assignment strategy is designed to meet with our demand of accuracy and speed. Our paradigm is an elegant single stage anchor-free framework, showing great superiority to other existing methods. We evaluate 3DSSD on widely used KITTI dataset and more challenging nuScenes dataset. Our method outperforms all state-of-the-art voxel-based single stage methods by a large margin, and has comparable performance to two stage point-based methods as well, with inference speed more than 25 FPS, 2x faster than former state-of-the-art point-based methods. + +
+ +
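For readers unfamiliar with point-based downsampling, the snippet below is a minimal NumPy sketch of distance-based farthest point sampling (D-FPS), the building block that 3DSSD's fusion sampling extends with a feature-space variant (F-FPS). It is an illustration only, with toy sizes chosen for the example; it is not the CUDA sampling op shipped with this repository.

```python
import numpy as np

def farthest_point_sample(points: np.ndarray, num_samples: int) -> np.ndarray:
    """Select `num_samples` point indices that are maximally spread out.

    Args:
        points: (N, 3) array of xyz coordinates.
        num_samples: number of points to keep.
    Returns:
        (num_samples,) array of selected indices.
    """
    n = points.shape[0]
    selected = np.zeros(num_samples, dtype=np.int64)
    # distance from every point to its closest already-selected point
    min_dist = np.full(n, np.inf)
    farthest = 0  # start from an arbitrary point (index 0)
    for i in range(num_samples):
        selected[i] = farthest
        dist = np.sum((points - points[farthest]) ** 2, axis=1)
        min_dist = np.minimum(min_dist, dist)
        # next pick: the point farthest from the current selection
        farthest = int(np.argmax(min_dist))
    return selected

# toy usage: keep 512 of 16384 random points
pts = np.random.rand(16384, 3).astype(np.float32)
sampled = pts[farthest_point_sample(pts, 512)]
```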
-### Experiment details on KITTI datasets +## Introduction + +We implement 3DSSD and provide the results and checkpoints on KITTI datasets. Some settings in our implementation are different from the [official implementation](https://github.com/Jia-Research-Lab/3DSSD), which bring marginal differences to the performance on KITTI datasets in our experiments. To simplify and unify the models of our implementation, we skip them in our models. These differences are listed as below: 1. We keep the scenes without any object while the official code skips these scenes in training. In the official implementation, only 3229 and 3394 samples are used as training and validation sets, respectively. In our implementation, we keep using 3712 and 3769 samples as training and validation sets, respectively, as those used for all the other models in our implementation on KITTI datasets. @@ -23,7 +22,7 @@ Some settings in our implementation are different from the [official implementat 3. While using [`DataBaseSampler`](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/dbsampler.py#L80) for data augmentation, the official code uses road planes as reference to place the sampled objects while we do not. 4. We perform detection using LIDAR coordinates while the official code uses camera coordinates. -## Results +## Results and models ### KITTI @@ -31,4 +30,15 @@ Some settings in our implementation are different from the [official implementat | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet2SAMSG](./3dssd_4x4_kitti-3d-car.py)| Car |72e|4.7||78.69(81.27)1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210602_124438-b4276f56.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210602_124438.log.json)| -[1]: We report two different 3D object detection performance here. 78.69mAP is evaluated by our evaluation code and 81.27mAP is evaluated by the official development kit (so as that used in the paper and official code of 3DSSD ). We found that the commonly used Python implementation of [`rotate_iou`](https://github.com/traveller59/second.pytorch/blob/e42e4a0e17262ab7d180ee96a0a36427f2c20a44/second/core/non_max_suppression/nms_gpu.py#L605) which is used in our KITTI dataset evaluation, is different from the official implemention in [KITTI benchmark](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). +[1]: We report two different 3D object detection performance here. 78.69mAP is evaluated by our evaluation code and 81.27mAP is evaluated by the official development kit (so as that used in the paper and official code of 3DSSD ). We found that the commonly used Python implementation of [`rotate_iou`](https://github.com/traveller59/second.pytorch/blob/e42e4a0e17262ab7d180ee96a0a36427f2c20a44/second/core/non_max_suppression/nms_gpu.py#L605) which is used in our KITTI dataset evaluation, is different from the official implementation in [KITTI benchmark](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). 
+ +## Citation + +```latex +@inproceedings{yang20203dssd, + author = {Zetong Yang and Yanan Sun and Shu Liu and Jiaya Jia}, + title = {3DSSD: Point-based 3D Single Stage Object Detector}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + year = {2020} +} +``` diff --git a/configs/_base_/datasets/kitti-mono3d.py b/configs/_base_/datasets/kitti-mono3d.py new file mode 100644 index 0000000000..5817dc7069 --- /dev/null +++ b/configs/_base_/datasets/kitti-mono3d.py @@ -0,0 +1,92 @@ +dataset_type = 'KittiMonoDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +input_modality = dict(use_lidar=False, use_camera=True) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1242, 375), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d', + 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + img_scale=(1242, 375), + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train_mono3d.coco.json', + info_file=data_root + 'kitti_infos_train.pkl', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val_mono3d.coco.json', + info_file=data_root + 'kitti_infos_val.pkl', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val_mono3d.coco.json', + info_file=data_root + 'kitti_infos_val.pkl', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py index 4e85b69abe..5fc198bb1d 100644 --- a/configs/_base_/default_runtime.py +++ b/configs/_base_/default_runtime.py @@ -16,3 +16,8 @@ load_from = None resume_from = None workflow = [('train', 1)] + +# disable opencv multithreading to avoid system being overloaded +opencv_num_threads = 0 +# set multi-process start method as `fork` to speed up the training +mp_start_method = 'fork' diff --git a/configs/_base_/models/dgcnn.py b/configs/_base_/models/dgcnn.py new file mode 100644 index 0000000000..61e7272692 --- /dev/null +++ b/configs/_base_/models/dgcnn.py @@ -0,0 +1,28 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='DGCNNBackbone', + in_channels=9, # [xyz, rgb, normal_xyz], modified with dataset + num_samples=(20, 20, 20), + knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), + radius=(None, None, None), + gf_channels=((64, 64), (64, 64), (64, )), + fa_channels=(1024, ), + act_cfg=dict(type='LeakyReLU', negative_slope=0.2)), + decode_head=dict( + type='DGCNNHead', + fp_channels=(1216, 512), + channels=256, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='LeakyReLU', negative_slope=0.2), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/configs/_base_/models/fcos3d.py b/configs/_base_/models/fcos3d.py index 92ea907605..be83001d8f 100644 --- a/configs/_base_/models/fcos3d.py +++ b/configs/_base_/models/fcos3d.py @@ -1,6 +1,5 @@ model = dict( type='FCOSMono3D', - pretrained='open-mmlab://detectron2/resnet101_caffe', backbone=dict( type='ResNet', depth=101, @@ -9,7 +8,10 @@ frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, - style='caffe'), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], @@ -29,6 +31,7 @@ pred_attrs=True, pred_velo=True, dir_offset=0.7854, # pi/4 + dir_limit_offset=0, strides=[8, 16, 32, 64, 128], group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo cls_branch=(256, ), @@ -54,6 +57,7 
@@ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), norm_on_bbox=True, centerness_on_reg=True, center_sampling=True, diff --git a/configs/_base_/models/hv_pointpillars_fpn_nus.py b/configs/_base_/models/hv_pointpillars_fpn_nus.py index e153f6c6e6..be29269ded 100644 --- a/configs/_base_/models/hv_pointpillars_fpn_nus.py +++ b/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -49,8 +49,8 @@ ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ - [0.8660, 2.5981, 1.], # 1.5/sqrt(3) - [0.5774, 1.7321, 1.], # 1/sqrt(3) + [2.5981, 0.8660, 1.], # 1.5 / sqrt(3) + [1.7321, 0.5774, 1.], # 1 / sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], @@ -59,8 +59,7 @@ reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 - dir_limit_offset=0, + dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', diff --git a/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/configs/_base_/models/hv_pointpillars_secfpn_kitti.py index 65e727daa0..ac46475d6e 100644 --- a/configs/_base_/models/hv_pointpillars_secfpn_kitti.py +++ b/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -34,6 +34,7 @@ in_channels=384, feat_channels=384, use_direction_classifier=True, + assign_per_class=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[ @@ -41,7 +42,7 @@ [0, -39.68, -0.6, 69.12, 39.68, -0.6], [0, -39.68, -1.78, 69.12, 39.68, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/configs/_base_/models/hv_pointpillars_secfpn_waymo.py index 14873ead47..30e23e9560 100644 --- a/configs/_base_/models/hv_pointpillars_secfpn_waymo.py +++ b/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -48,15 +48,14 @@ [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], [-74.88, -74.88, 0, 74.88, 74.88, 0]], sizes=[ - [2.08, 4.73, 1.77], # car - [0.84, 1.81, 1.77], # cyclist - [0.84, 0.91, 1.74] # pedestrian + [4.73, 2.08, 1.77], # car + [1.81, 0.84, 1.77], # cyclist + [0.91, 0.84, 1.74] # pedestrian ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 - dir_limit_offset=0, + dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', diff --git a/configs/_base_/models/hv_second_secfpn_kitti.py b/configs/_base_/models/hv_second_secfpn_kitti.py index 6bf18abe1d..e7d569a527 100644 --- a/configs/_base_/models/hv_second_secfpn_kitti.py +++ b/configs/_base_/models/hv_second_secfpn_kitti.py @@ -37,7 +37,7 @@ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/_base_/models/hv_second_secfpn_waymo.py b/configs/_base_/models/hv_second_secfpn_waymo.py index eb9bd3ae5c..0fa39e1505 100644 --- a/configs/_base_/models/hv_second_secfpn_waymo.py +++ b/configs/_base_/models/hv_second_secfpn_waymo.py @@ -42,15 +42,14 @@ [-76.8, -51.2, 0, 76.8, 51.2, 0], [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], sizes=[ - [2.08, 
4.73, 1.77], # car - [0.84, 0.91, 1.74], # pedestrian - [0.84, 1.81, 1.77] # cyclist + [4.73, 2.08, 1.77], # car + [0.91, 0.84, 1.74], # pedestrian + [1.81, 0.84, 1.77] # cyclist ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 - dir_limit_offset=0, + dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', diff --git a/configs/_base_/models/parta2.py b/configs/_base_/models/parta2.py index 6c5ae9a663..aa1556789f 100644 --- a/configs/_base_/models/parta2.py +++ b/configs/_base_/models/parta2.py @@ -38,7 +38,7 @@ ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/_base_/models/pgd.py b/configs/_base_/models/pgd.py new file mode 100644 index 0000000000..e63fc1fceb --- /dev/null +++ b/configs/_base_/models/pgd.py @@ -0,0 +1,55 @@ +_base_ = './fcos3d.py' +# model settings +model = dict( + bbox_head=dict( + _delete_=True, + type='PGDHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + pred_bbox2d=True, + pred_keypoints=False, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True, + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 50), + depth_unit=10, + division='uniform', + depth_bins=6, + bbox_coder=dict(type='PGDBBoxCoder', code_size=9)), + test_cfg=dict(nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200)) diff --git a/configs/_base_/models/point_rcnn.py b/configs/_base_/models/point_rcnn.py new file mode 100644 index 0000000000..02a1414f7d --- /dev/null +++ b/configs/_base_/models/point_rcnn.py @@ -0,0 +1,131 @@ +model = dict( + type='PointRCNN', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 1024, 256, 64), + radii=((0.1, 0.5), (0.5, 1.0), (1.0, 2.0), (2.0, 4.0)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + aggregation_channels=(None, None, None, None), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), 
+ neck=dict( + type='PointNetFPNeck', + fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256), + (257, 128, 128))), + rpn_head=dict( + type='PointRPNHead', + num_classes=3, + enlarge_width=0.1, + pred_layer_cfg=dict( + in_channels=128, + cls_linear_channels=(256, 256), + reg_linear_channels=(256, 256)), + cls_loss=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + bbox_loss=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + bbox_coder=dict( + type='PointXYZWHLRBBoxCoder', + code_size=8, + # code_size: (center residual (3), size regression (3), + # torch.cos(yaw) (1), torch.sin(yaw) (1) + use_mean_size=True, + mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6, + 1.73]])), + roi_head=dict( + type='PointRCNNRoIHead', + point_roi_extractor=dict( + type='Single3DRoIPointExtractor', + roi_layer=dict(type='RoIPointPool3d', num_sampled_points=512)), + bbox_head=dict( + type='PointRCNNBboxHead', + num_classes=1, + pred_layer_cfg=dict( + in_channels=512, + cls_conv_channels=(256, 256), + reg_conv_channels=(256, 256), + bias=True), + in_channels=5, + # 5 = 3 (xyz) + scores + depth + mlp_channels=[128, 128], + num_points=(128, 32, -1), + radius=(0.2, 0.4, 100), + num_samples=(16, 16, 16), + sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), + with_corner_loss=True), + depth_normalizer=70.0), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=10.0, + rpn=dict( + nms_cfg=dict( + use_rotate_nms=True, iou_thr=0.8, nms_pre=9000, nms_post=512), + score_thr=None), + rcnn=dict( + assigner=[ + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1, + match_low_quality=False), + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1, + match_low_quality=False), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1, + match_low_quality=False) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.5, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.7, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_cfg=dict( + use_rotate_nms=True, iou_thr=0.85, nms_pre=9000, nms_post=512), + score_thr=None), + rcnn=dict(use_rotate_nms=True, nms_thr=0.1, score_thr=0.1))) diff --git a/configs/_base_/models/smoke.py b/configs/_base_/models/smoke.py new file mode 100644 index 0000000000..0a7452b430 --- /dev/null +++ b/configs/_base_/models/smoke.py @@ -0,0 +1,53 @@ +model = dict( + type='SMOKEMono3D', + backbone=dict( + type='DLANet', + depth=34, + in_channels=3, + norm_cfg=dict(type='GN', num_groups=32), + init_cfg=dict( + type='Pretrained', + checkpoint='http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth' + )), + neck=dict( + type='DLANeck', + in_channels=[16, 32, 64, 128, 256, 512], + start_level=2, + end_level=5, + norm_cfg=dict(type='GN', num_groups=32)), + bbox_head=dict( + type='SMOKEMono3DHead', + num_classes=3, + in_channels=64, + dim_channel=[3, 4, 5], + ori_channel=[6, 7], + stacked_convs=0, + feat_channels=64, + 
use_direction_classifier=False, + diff_rad_by_sin=False, + pred_attrs=False, + pred_velo=False, + dir_offset=0, + strides=None, + group_reg_dims=(8, ), + cls_branch=(256, ), + reg_branch=((256, ), ), + num_attrs=0, + bbox_code_size=7, + dir_branch=(), + attr_branch=(), + bbox_coder=dict( + type='SMOKECoder', + base_depth=(28.01, 16.32), + base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63, + 1.53)), + code_size=7), + loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=None, + conv_bias=True, + dcn_on_last_conv=False), + train_cfg=None, + test_cfg=dict(topK=100, local_maximum_kernel=3, max_per_img=100)) diff --git a/configs/_base_/schedules/cyclic_40e.py b/configs/_base_/schedules/cyclic_40e.py index 4a711acf4f..664986331e 100644 --- a/configs/_base_/schedules/cyclic_40e.py +++ b/configs/_base_/schedules/cyclic_40e.py @@ -5,7 +5,7 @@ # the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4 lr = 0.0018 # The optimizer follows the setting in SECOND.Pytorch, but here we use -# the offcial AdamW optimizer implemented by PyTorch. +# the official AdamW optimizer implemented by PyTorch. optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # We use cyclic learning rate and momentum schedule following SECOND.Pytorch diff --git a/configs/_base_/schedules/seg_cosine_100e.py b/configs/_base_/schedules/seg_cosine_100e.py new file mode 100644 index 0000000000..3b75932b3a --- /dev/null +++ b/configs/_base_/schedules/seg_cosine_100e.py @@ -0,0 +1,8 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py b/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py index 19862097a3..398a19cd2f 100644 --- a/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py +++ b/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py @@ -38,7 +38,7 @@ ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py b/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py index 42a31a2170..72c7372451 100644 --- a/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py +++ b/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py @@ -37,7 +37,7 @@ anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=True), diff_rad_by_sin=True, diff --git a/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py b/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py index 
76ddd69a06..02eed9fb18 100644 --- a/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py +++ b/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py @@ -48,7 +48,7 @@ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py b/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py index 1f2b109bf8..d61a050fb1 100644 --- a/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py +++ b/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py @@ -39,7 +39,7 @@ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, diff --git a/configs/centerpoint/README.md b/configs/centerpoint/README.md index 69d4cdf90a..76016cb789 100644 --- a/configs/centerpoint/README.md +++ b/configs/centerpoint/README.md @@ -1,9 +1,19 @@ # Center-based 3D Object Detection and Tracking -## Introduction +> [Center-based 3D Object Detection and Tracking](https://arxiv.org/abs/2006.11275) +## Abstract + +Three-dimensional objects are commonly represented as 3D boxes in a point-cloud. This representation mimics the well-studied image-based 2D bounding-box detection but comes with additional challenges. Objects in a 3D world do not follow any particular orientation, and box-based detectors have difficulties enumerating all orientations or fitting an axis-aligned bounding box to rotated objects. In this paper, we instead propose to represent, detect, and track 3D objects as points. Our framework, CenterPoint, first detects centers of objects using a keypoint detector and regresses to other attributes, including 3D size, 3D orientation, and velocity. In a second stage, it refines these estimates using additional point features on the object. In CenterPoint, 3D object tracking simplifies to greedy closest-point matching. The resulting detection and tracking algorithm is simple, efficient, and effective. CenterPoint achieved state-of-the-art performance on the nuScenes benchmark for both 3D detection and tracking, with 65.5 NDS and 63.8 AMOTA for a single model. On the Waymo Open Dataset, CenterPoint outperforms all previous single model method by a large margin and ranks first among all Lidar-only submissions. + +
+ +
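As a rough illustration of the center-based first stage described in the abstract, the sketch below picks top-k peaks from a class heatmap and gathers the regressed sub-pixel offsets at those locations. The tensor names and shapes are assumptions made for this example only; they do not match the CenterPoint head API in this repository.

```python
import torch
import torch.nn.functional as F

def decode_centers(heatmap: torch.Tensor, reg: torch.Tensor, k: int = 100):
    """Decode object centers from a BEV heatmap.

    Args:
        heatmap: (num_classes, H, W) sigmoid scores.
        reg: (2, H, W) sub-pixel x/y offsets predicted at each location.
        k: number of candidates to keep.
    Returns:
        scores (k,), labels (k,), centers (k, 2) in feature-map coordinates.
    """
    num_classes, h, w = heatmap.shape
    # simple peak selection: keep a location only if it equals its 3x3 max
    pooled = F.max_pool2d(heatmap[None], kernel_size=3, stride=1, padding=1)[0]
    heatmap = heatmap * (pooled == heatmap).float()

    scores, inds = heatmap.view(-1).topk(k)
    labels = inds // (h * w)          # which class map the peak came from
    spatial = inds % (h * w)          # flattened location inside that map
    ys, xs = (spatial // w).float(), (spatial % w).float()

    offsets = reg.view(2, -1)[:, spatial].t()          # (k, 2) x/y offsets
    centers = torch.stack([xs, ys], dim=1) + offsets
    return scores, labels, centers

# toy usage
hm, offset = torch.rand(3, 128, 128), torch.rand(2, 128, 128)
scores, labels, centers = decode_centers(hm, offset, k=50)
```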
+ +## Introduction + We implement CenterPoint and provide the result and checkpoints on nuScenes dataset. We follow the below style to name config files. Contributors are advised to follow the same style. @@ -27,15 +37,6 @@ We follow the below style to name config files. Contributors are advised to foll `{dataset}`: dataset like nus-3d, kitti-3d, lyft-3d, scannet-3d, sunrgbd-3d. We also indicate the number of classes we are using if there exist multiple settings, e.g., kitti-3d-3class and kitti-3d-car means training on KITTI dataset with 3 classes and single class, respectively. -``` -@article{yin2021center, - title={Center-based 3D Object Detection and Tracking}, - author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp}, - journal={CVPR}, - year={2021}, -} -``` - ## Usage ### Test time augmentation @@ -103,7 +104,7 @@ data = dict( ``` -## Results +## Results and models ### CenterPoint @@ -124,3 +125,14 @@ data = dict( |above w/o circle nms|pillar (0.2)|✗|✗| | |49.12|59.66|| |[SECFPN](./centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py)|pillar (0.2)|✓|✗| 4.6| |48.8 |59.67 |[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722-3bb135f2.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722.log.json)| |above w/ circle nms|pillar (0.2)|✓|✓| | |48.79|59.65|| + +## Citation + +```latex +@article{yin2021center, + title={Center-based 3D Object Detection and Tracking}, + author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp}, + journal={CVPR}, + year={2021}, +} +``` diff --git a/configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py b/configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py index 770a11c68c..cdbdf0600f 100644 --- a/configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py +++ b/configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py @@ -1,5 +1,7 @@ _base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py' +model = dict(test_cfg=dict(pts=dict(use_rotate_nms=True, max_num=500))) + point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] file_client_args = dict(backend='disk') class_names = [ diff --git a/configs/dgcnn/README.md b/configs/dgcnn/README.md new file mode 100644 index 0000000000..20819f5b25 --- /dev/null +++ b/configs/dgcnn/README.md @@ -0,0 +1,55 @@ +# Dynamic Graph CNN for Learning on Point Clouds + +> [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829) + + + +## Abstract + +Point clouds provide a flexible geometric representation suitable for countless applications in computer graphics; they also comprise the raw output of most 3D data acquisition devices. While hand-designed features on point clouds have long been proposed in graphics and vision, however, the recent overwhelming success of convolutional neural networks (CNNs) for image analysis suggests the value of adapting insight from CNN to the point cloud world. Point clouds inherently lack topological information so designing a model to recover topology can enrich the representation power of point clouds. 
To this end, we propose a new neural network module dubbed EdgeConv suitable for CNN-based high-level tasks on point clouds including classification and segmentation. EdgeConv acts on graphs dynamically computed in each layer of the network. It is differentiable and can be plugged into existing architectures. Compared to existing modules operating in extrinsic space or treating each point independently, EdgeConv has several appealing properties: It incorporates local neighborhood information; it can be stacked applied to learn global shape properties; and in multi-layer systems affinity in feature space captures semantic characteristics over potentially long distances in the original embedding. We show the performance of our model on standard benchmarks including ModelNet40, ShapeNetPart, and S3DIS. + +
+ +
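The sketch below is a minimal PyTorch version of the EdgeConv operation summarized in the abstract: build a k-NN graph in feature space, form edge features [x_i, x_j - x_i], apply a shared MLP and max-aggregate over neighbours. It only illustrates the idea and is not the `DGCNNBackbone` implementation used in this codebase.

```python
import torch
import torch.nn as nn

def knn(x: torch.Tensor, k: int) -> torch.Tensor:
    """x: (B, C, N) features. Returns (B, N, k) neighbour indices (self included)."""
    # pairwise squared distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2
    inner = -2 * torch.matmul(x.transpose(2, 1), x)   # (B, N, N)
    sq = torch.sum(x ** 2, dim=1, keepdim=True)       # (B, 1, N)
    neg_dist = -sq.transpose(2, 1) - inner - sq       # negative squared distance
    return neg_dist.topk(k=k, dim=-1).indices         # (B, N, k)

class EdgeConv(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, k: int = 20):
        super().__init__()
        self.k = k
        self.mlp = nn.Sequential(
            nn.Conv2d(2 * in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.2))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, n = x.shape
        idx = knn(x, self.k)                           # graph recomputed per layer
        feats = x.transpose(2, 1)                      # (B, N, C)
        # gather neighbour features for every centre point
        # (dense O(N^2) gather, fine for a small illustration)
        neighbours = torch.gather(
            feats.unsqueeze(1).expand(b, n, n, c), 2,
            idx.unsqueeze(-1).expand(b, n, self.k, c))  # (B, N, k, C)
        centres = feats.unsqueeze(2).expand(b, n, self.k, c)
        edge = torch.cat([centres, neighbours - centres], dim=-1)  # (B, N, k, 2C)
        edge = edge.permute(0, 3, 1, 2)                # (B, 2C, N, k)
        return self.mlp(edge).max(dim=-1).values       # (B, out_channels, N)

# toy usage: 1024 points with xyz as the initial features
points = torch.rand(2, 3, 1024)
out = EdgeConv(3, 64)(points)   # (2, 64, 1024)
```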
+ +## Introduction + +We implement DGCNN and provide the results and checkpoints on S3DIS dataset. + +**Notice**: We follow the implementations in the original DGCNN paper and a PyTorch implementation of DGCNN [code](https://github.com/AnTao97/dgcnn.pytorch). + +## Results and models + +### S3DIS + +| Method | Split | Lr schd | Mem (GB) | Inf time (fps) | mIoU (Val set) | Download | +| :-------------------------------------------------------------------------: | :----: | :--------: | :------: | :------------: | :------------: | :----------------------: | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_1 | cosine 100e | 13.1 | | 68.33 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734-39658f14.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734.log.json) | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_2 | cosine 100e | 13.1 | | 40.68 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648-aea9ecb6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648.log.json) | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_3 | cosine 100e | 13.1 | | 69.38 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629-2ff50ee0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629.log.json) | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_4 | cosine 100e | 13.1 | | 50.07 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551-dffab9cd.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551.log.json) | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_5 | cosine 100e | 13.1 | | 50.59 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824.log.json) | +| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_6 | cosine 100e | 13.1 | | 77.94 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317-e3511b32.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317.log.json) | +| 
[DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | 6-fold | | | | 59.43 | | + +**Notes:** + +- We use XYZ+Color+Normalized_XYZ as input in all the experiments on S3DIS datasets. +- `Area_5` Split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5. +- `6-fold` Split means the overall result of 6 different splits (Area_1, Area_2, Area_3, Area_4, Area_5 and Area_6 Splits). +- Users need to modify `train_area` and `test_area` in the S3DIS dataset's [config](./configs/_base_/datasets/s3dis_seg-3d-13class.py) to set the training and testing areas, respectively. + +## Indeterminism + +Since DGCNN testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above. + +## Citation + +```latex +@article{dgcnn, + title={Dynamic Graph CNN for Learning on Point Clouds}, + author={Wang, Yue and Sun, Yongbin and Liu, Ziwei and Sarma, Sanjay E. and Bronstein, Michael M. and Solomon, Justin M.}, + journal={ACM Transactions on Graphics (TOG)}, + year={2019} +} +``` diff --git a/configs/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py b/configs/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py new file mode 100644 index 0000000000..6f1b5822a2 --- /dev/null +++ b/configs/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/datasets/s3dis_seg-3d-13class.py', '../_base_/models/dgcnn.py', + '../_base_/schedules/seg_cosine_100e.py', '../_base_/default_runtime.py' +] + +# data settings +data = dict(samples_per_gpu=32) +evaluation = dict(interval=2) + +# model settings +model = dict( + backbone=dict(in_channels=9), # [xyz, rgb, normalized_xyz] + decode_head=dict( + num_classes=13, ignore_index=13, + loss_decode=dict(class_weight=None)), # S3DIS doesn't use class_weight + test_cfg=dict( + num_points=4096, + block_size=1.0, + sample_rate=0.5, + use_normalized_coord=True, + batch_size=24)) + +# runtime settings +checkpoint_config = dict(interval=2) diff --git a/configs/dgcnn/metafile.yml b/configs/dgcnn/metafile.yml new file mode 100644 index 0000000000..87ff9156bc --- /dev/null +++ b/configs/dgcnn/metafile.yml @@ -0,0 +1,24 @@ +Collections: + - Name: DGCNN + Metadata: + Training Techniques: + - SGD + Training Resources: 4x Titan XP GPUs + Architecture: + - DGCNN + Paper: https://arxiv.org/abs/1801.07829 + README: configs/dgcnn/README.md + +Models: + - Name: dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py + In Collection: DGCNN + Config: configs/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py + Metadata: + Training Data: S3DIS + Training Memory (GB): 13.3 + Results: + - Task: 3D Semantic Segmentation + Dataset: S3DIS + Metrics: + mIoU: 50.59 + Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth diff --git a/configs/dynamic_voxelization/README.md b/configs/dynamic_voxelization/README.md index eab62d48f0..18ac4309cc 100644 --- a/configs/dynamic_voxelization/README.md +++ b/configs/dynamic_voxelization/README.md @@ -1,24 +1,22 @@ # Dynamic Voxelization -## Introduction +> [End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds](https://arxiv.org/abs/1910.06528) -We implement Dynamic Voxelization proposed in and provide its results and models on KITTI dataset. 
+## Abstract -``` -@article{zhou2019endtoend, - title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds}, - author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan}, - year={2019}, - eprint={1910.06528}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} +Recent work on 3D object detection advocates point cloud voxelization in birds-eye view, where objects preserve their physical dimensions and are naturally separable. When represented in this view, however, point clouds are sparse and have highly variable point density, which may cause detectors difficulties in detecting distant or small objects (pedestrians, traffic signs, etc.). On the other hand, perspective view provides dense observations, which could allow more favorable feature encoding for such cases. In this paper, we aim to synergize the birds-eye view and the perspective view and propose a novel end-to-end multi-view fusion (MVF) algorithm, which can effectively learn to utilize the complementary information from both. Specifically, we introduce dynamic voxelization, which has four merits compared to existing voxelization methods, i) removing the need of pre-allocating a tensor with fixed size; ii) overcoming the information loss due to stochastic point/voxel dropout; iii) yielding deterministic voxel embeddings and more stable detection outcomes; iv) establishing the bi-directional relationship between points and voxels, which potentially lays a natural foundation for cross-view feature fusion. By employing dynamic voxelization, the proposed feature fusion architecture enables each point to learn to fuse context information from different views. MVF operates on points and can be naturally extended to other approaches using LiDAR point clouds. We evaluate our MVF model extensively on the newly released Waymo Open Dataset and on the KITTI dataset and demonstrate that it significantly improves detection accuracy over the comparable single-view PointPillars baseline. -``` +
+ +
-## Results +## Introduction + +We implement Dynamic Voxelization proposed in and provide its results and models on KITTI dataset. + +## Results and models ### KITTI @@ -27,3 +25,16 @@ We implement Dynamic Voxelization proposed in and provide its results and model |[SECOND](./dv_second_secfpn_6x8_80e_kitti-3d-car.py)|Car |cyclic 80e|5.5||78.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228.log.json)| |[SECOND](./dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py)| 3 Class|cosine 80e|5.5||65.10|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010-6aa607d3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010.log.json)| |[PointPillars](./dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)| Car|cyclic 80e|4.7||77.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844.log.json)| + +## Citation + +```latex +@article{zhou2019endtoend, + title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds}, + author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan}, + year={2019}, + eprint={1910.06528}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/fcos3d/README.md b/configs/fcos3d/README.md index 5e22e27606..be517ec405 100644 --- a/configs/fcos3d/README.md +++ b/configs/fcos3d/README.md @@ -1,30 +1,24 @@ # FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection -## Introduction +> [FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection](https://arxiv.org/abs/2104.10956) +## Abstract + +Monocular 3D object detection is an important task for autonomous driving considering its advantage of low cost. It is much more challenging than conventional 2D cases due to its inherent ill-posed property, which is mainly reflected in the lack of depth information. Recent progress on 2D detection offers opportunities to better solving this problem. However, it is non-trivial to make a general adapted 2D detector work in this 3D task. In this paper, we study this problem with a practice built on a fully convolutional single-stage detector and propose a general framework FCOS3D. Specifically, we first transform the commonly defined 7-DoF 3D targets to the image domain and decouple them as 2D and 3D attributes. Then the objects are distributed to different feature levels with consideration of their 2D scales and assigned only according to the projected 3D-center for the training procedure. 
Furthermore, the center-ness is redefined with a 2D Gaussian distribution based on the 3D-center to fit the 3D target formulation. All of these make this framework simple yet effective, getting rid of any 2D detection or 2D-3D correspondence priors. Our solution achieves 1st place out of all the vision-only methods in the nuScenes 3D detection challenge of NeurIPS 2020. + +
+ +
+ +## Introduction + FCOS3D is a general anchor-free, one-stage monocular 3D object detector adapted from the original 2D version FCOS. It serves as a baseline built on top of mmdetection and mmdetection3d for 3D detection based on monocular vision. Currently we first support the benchmark on the large-scale nuScenes dataset, which achieved 1st place out of all the vision-only methods in the [nuScenes 3D detecton challenge](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) of NeurIPS 2020. -``` -@inproceedings{wang2021fcos3d, - title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection}, - author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, - year={2021} -} -# For the original 2D version -@inproceedings{tian2019fcos, - title = {{FCOS: Fully} Convolutional One-Stage Object Detection}, - author = {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, - booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, - year = {2019} -} -``` - ![demo image](../../resources/browse_dataset_mono.png) ## Usage @@ -52,7 +46,7 @@ Due to the scale and measurements of depth is different from those of other regr We also provide visualization functions to show the monocular 3D detection results. Simply follow the [documentation](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models-on-standard-datasets) and use the `single-gpu testing` command. You only need to add the `--show` flag and specify `--show-dir` to store the visualization results. -## Results +## Results and models ### NuScenes @@ -61,3 +55,21 @@ We also provide visualization functions to show the monocular 3D detection resul |[ResNet101 w/ DCN](./fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py)|1x|8.69||29.8|37.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813-4bed5239.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813.log.json)| |[above w/ finetune](./fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune.py)|1x|8.69||32.1|39.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645.log.json)| |above w/ tta|1x|8.69||33.1|40.3|| + +## Citation + +```latex +@inproceedings{wang2021fcos3d, + title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection}, + author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + year={2021} +} +# For the original 2D version +@inproceedings{tian2019fcos, + title = {{FCOS: Fully} Convolutional One-Stage Object Detection}, + author = {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, + booktitle = {Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)}, + year = {2019} +} +``` diff --git a/configs/fp16/README.md b/configs/fp16/README.md deleted file mode 100644 index 697d1b8bd5..0000000000 --- a/configs/fp16/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Mixed Precision Training - -## Introduction - - - -We implement mixed precision training and apply it to VoxelNets (e.g., SECOND and PointPillars). -The results are in the following tables. - -**Note**: For mixed precision training, we currently do not support PointNet-based methods (e.g., VoteNet). -Mixed precision training for PointNet-based methods will be supported in the future release. - -## Results - -### SECOND on KITTI dataset - -| Backbone |Class| Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP16 mAP |Download | -| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | :------: | -| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4|2.9|79.07|78.72|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)| -| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4|2.9|64.41|67.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json)| - -### PointPillars on nuScenes dataset - -| Backbone | Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP32 NDS| FP16 mAP | FP16 NDS| Download | -| :---------: | :-----: | :------: | :------------: | :----: |:----: | :----: |:----: | :------: | -|[SECFPN](./hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.37|35.17|49.7|35.19|50.27|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json)| -|[FPN](./hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.40|40.0|53.3|39.26|53.26|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)| - -**Note**: -1. With mixed precision training, we can train PointPillars with nuScenes dataset on 8 Titan XP GPUS with batch size of 2. -This will cause OOM error without mixed precision training. -2. The loss scale for PointPillars on nuScenes dataset is specifically tuned to avoid the loss to be Nan. We find 32 is more stable than 512, though loss scale 32 still cause Nan sometimes. 
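For reference, the mixed precision variants described above are defined by a small config that inherits a full-precision config and adds an `fp16` dict. A minimal sketch (the base config path is illustrative and must point at an existing full-precision config):

```python
# Minimal sketch of a mixed precision (FP16) variant config in MMDetection3D.
# The base config path is illustrative; point it at an existing full-precision
# config of the model to be trained.
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py'

# FP16 settings: a loss scale of 512 works for the KITTI SECOND models above,
# while the note above suggests a smaller value (e.g. 32) is more stable for
# PointPillars on nuScenes.
fp16 = dict(loss_scale=512.)
```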
diff --git a/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py b/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py deleted file mode 100644 index 0632a04842..0000000000 --- a/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py' -# fp16 settings -fp16 = dict(loss_scale=512.) diff --git a/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py b/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py deleted file mode 100644 index a2aae1518e..0000000000 --- a/configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-car.py' -# fp16 settings -fp16 = dict(loss_scale=512.) diff --git a/configs/fp16/metafile.yml b/configs/fp16/metafile.yml deleted file mode 100644 index 66b7aca717..0000000000 --- a/configs/fp16/metafile.yml +++ /dev/null @@ -1,70 +0,0 @@ -Collections: - - Name: FP16 - Metadata: - Training Techniques: - - AdamW - - Mixed Precision Training - Training Resources: 8x TITAN Xp - Architecture: - - Hard Voxelization - Paper: - URL: https://arxiv.org/abs/1710.03740 - Title: 'Mixed Precision Training' - README: configs/fp16/README.md - Code: - Version: v0.7.0 - -Models: - - Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-car - In Collection: FP16 - Config: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py - Metadata: - Training Data: KITTI - Training Memory (GB): 2.9 - Results: - - Task: 3D Object Detection - Dataset: KITTI - Metrics: - FP16 mAP: 78.72 - Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth - - - Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class - In Collection: FP16 - Config: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py - Metadata: - Training Data: KITTI - Training Memory (GB): 2.9 - Results: - - Task: 3D Object Detection - Dataset: KITTI - Metrics: - FP16 mAP: 67.4 - Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth - - - Name: hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d - In Collection: FP16 - Config: configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py - Metadata: - Training Data: nuScenes - Training Memory (GB): 8.37 - Results: - - Task: 3D Object Detection - Dataset: nuScenes - Metrics: - FP16 mAP: 35.19 - FP16 NDS: 50.27 - Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth - - - Name: hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d - In Collection: FP16 - Config: configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py - Metadata: - Training Data: nuScenes - Training Memory (GB): 8.40 - Results: - - Task: 3D Object Detection - Dataset: nuScenes - Metrics: - FP16 mAP: 39.26 - FP16 NDS: 53.26 - Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth diff --git a/configs/free_anchor/README.md b/configs/free_anchor/README.md index b38cf3c2f9..c92a43952a 100644 --- a/configs/free_anchor/README.md +++ b/configs/free_anchor/README.md @@ -1,22 +1,23 
@@ # FreeAnchor for 3D Object Detection -## Introduction +> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466) +## Abstract + +Modern CNN-based object detectors assign anchors for ground-truth objects under the restriction of object-anchor Intersection-over-Unit (IoU). In this study, we propose a learning-to-match approach to break IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to “free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor targets at learning features which best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing detection customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms the counterparts with significant margins. + +
+ +
+ +## Introduction + We implement FreeAnchor in 3D detection systems and provide their first results with PointPillars on nuScenes dataset. With the implemented `FreeAnchor3DHead`, a PointPillar detector with a big backbone (e.g., RegNet-3.2GF) achieves top performance on the nuScenes benchmark. -``` -@inproceedings{zhang2019freeanchor, - title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection}, - author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang}, - booktitle = {Neural Information Processing Systems}, - year = {2019} -} -``` - ## Usage ### Modify config @@ -49,8 +50,8 @@ model = dict( ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ - [0.8660, 2.5981, 1.], # 1.5/sqrt(3) - [0.5774, 1.7321, 1.], # 1/sqrt(3) + [2.5981, 0.8660, 1.], # 1.5 / sqrt(3) + [1.7321, 0.5774, 1.], # 1 / sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], @@ -59,8 +60,7 @@ model = dict( reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 - dir_limit_offset=0, + dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', @@ -76,7 +76,7 @@ model = dict( pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25]))) ``` -## Results +## Results and models ### PointPillars @@ -92,3 +92,14 @@ model = dict( |[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|29.5||55.09|63.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452-297fdc66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452.log.json)| **Note**: Models noted by `*` means it is trained using stronger augmentation with vertical flip under bird-eye-view, global translation, and larger range of global rotation. 
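+
+As a quick sanity check of the FreeAnchor settings described in the Usage section above, the assembled config can be inspected with `mmcv` (a minimal sketch; it assumes the nuScenes PointPillars-based configs keep the detection head under `model.pts_bbox_head`):
+
+```python
+from mmcv import Config
+
+# Load one of the provided FreeAnchor configs and print the parts that the
+# "Modify config" snippet above overrides.
+cfg = Config.fromfile(
+    'configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py')
+print(cfg.model.pts_bbox_head['type'])  # expected to be the FreeAnchor3DHead mentioned above
+print(cfg.model.pts_bbox_head['anchor_generator']['sizes'])  # anchor sizes listed above
+```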
+ +## Citation + +```latex +@inproceedings{zhang2019freeanchor, + title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection}, + author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang}, + booktitle = {Neural Information Processing Systems}, + year = {2019} +} +``` diff --git a/configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py b/configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py index d0a989f121..7412b93085 100644 --- a/configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py +++ b/configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py @@ -21,8 +21,8 @@ ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ - [0.8660, 2.5981, 1.], # 1.5/sqrt(3) - [0.5774, 1.7321, 1.], # 1/sqrt(3) + [2.5981, 0.8660, 1.], # 1.5 / sqrt(3) + [1.7321, 0.5774, 1.], # 1 / sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], @@ -31,8 +31,7 @@ reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 - dir_limit_offset=0, + dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', diff --git a/configs/groupfree3d/README.md b/configs/groupfree3d/README.md index fede6af7f4..5c0a104b09 100644 --- a/configs/groupfree3d/README.md +++ b/configs/groupfree3d/README.md @@ -1,21 +1,22 @@ # Group-Free 3D Object Detection via Transformers -## Introduction +> [Group-Free 3D Object Detection via Transformers](https://arxiv.org/abs/2104.00678) -We implement Group-Free-3D and provide the result and checkpoints on ScanNet datasets. +## Abstract -``` -@article{liu2021, - title={Group-Free 3D Object Detection via Transformers}, - author={Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin}, - journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, - year={2021} -} -``` +Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers, where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D. + +
+ +
+ +## Introduction + +We implement Group-Free-3D and provide the result and checkpoints on ScanNet datasets. -## Results +## Results and models ### ScanNet @@ -30,3 +31,14 @@ We implement Group-Free-3D and provide the result and checkpoints on ScanNet dat - We report the best results (AP@0.50) on validation set during each training. * means the evaluation method in the paper: we train each setting 5 times and test each training trial 5 times, then the average performance of these 25 trials is reported to account for algorithm randomness. - We use 4 GPUs for training by default as the original code. + +## Citation + +```latex +@article{liu2021, + title={Group-Free 3D Object Detection via Transformers}, + author={Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin}, + journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + year={2021} +} +``` diff --git a/configs/h3dnet/README.md b/configs/h3dnet/README.md index 07b63d63a7..c01338930c 100644 --- a/configs/h3dnet/README.md +++ b/configs/h3dnet/README.md @@ -1,21 +1,22 @@ # H3DNet: 3D Object Detection Using Hybrid Geometric Primitives -## Introduction +> [H3DNet: 3D Object Detection Using Hybrid Geometric Primitives](https://arxiv.org/abs/2006.05682) -We implement H3DNet and provide the result and checkpoints on ScanNet datasets. +## Abstract -``` -@inproceedings{zhang2020h3dnet, - author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing}, - title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives}, - booktitle = {Proceedings of the European Conference on Computer Vision}, - year = {2020} -} -``` +We introduce H3DNet, which takes a colorless 3D point cloud as input and outputs a collection of oriented object bounding boxes (or BB) and their semantic labels. The critical idea of H3DNet is to predict a hybrid set of geometric primitives, i.e., BB centers, BB face centers, and BB edge centers. We show how to convert the predicted geometric primitives into object proposals by defining a distance function between an object and the geometric primitives. This distance function enables continuous optimization of object proposals, and its local minimums provide high-fidelity object proposals. H3DNet then utilizes a matching and refinement module to classify object proposals into detected objects and fine-tune the geometric parameters of the detected objects. The hybrid set of geometric primitives not only provides more accurate signals for object detection than using a single type of geometric primitives, but it also provides an overcomplete set of constraints on the resulting 3D layout. Therefore, H3DNet can tolerate outliers in predicted geometric primitives. Our model achieves state-of-the-art 3D detection results on two large datasets with real 3D scans, ScanNet and SUN RGB-D. + +
+ +
+ +## Introduction + +We implement H3DNet and provide the result and checkpoints on ScanNet datasets. -## Results +## Results and models ### ScanNet @@ -30,3 +31,14 @@ python ./tools/model_converters/convert_h3dnet_checkpoints.py ${ORIGINAL_CHECKPO ``` Then you can use the converted checkpoints following [getting_started.md](../../docs/en/getting_started.md). + +## Citation + +```latex +@inproceedings{zhang2020h3dnet, + author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing}, + title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives}, + booktitle = {Proceedings of the European Conference on Computer Vision}, + year = {2020} +} +``` diff --git a/configs/imvotenet/README.md b/configs/imvotenet/README.md index f1e09ad802..5e1d66ccb9 100644 --- a/configs/imvotenet/README.md +++ b/configs/imvotenet/README.md @@ -1,22 +1,22 @@ # ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes -## Introduction +> [ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes](https://arxiv.org/abs/2001.10692) -We implement ImVoteNet and provide the result and checkpoints on SUNRGBD. +## Abstract -``` -@inproceedings{qi2020imvotenet, - title={Imvotenet: Boosting 3D object detection in point clouds with image votes}, - author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J}, - booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, - pages={4404--4413}, - year={2020} -} -``` +3D object detection has seen quick progress thanks to advances in deep learning on point clouds. A few recent works have even shown state-of-the-art performance with just point clouds input (e.g. VOTENET). However, point cloud data have inherent limitations. They are sparse, lack color information and often suffer from sensor noise. Images, on the other hand, have high resolution and rich texture. Thus they can complement the 3D geometry provided by point clouds. Yet how to effectively use image information to assist point cloud based detection is still an open question. In this work, we build on top of VOTENET and propose a 3D detection architecture called IMVOTENET specialized for RGB-D scenes. IMVOTENET is based on fusing 2D votes in images and 3D votes in point clouds. Compared to prior work on multi-modal detection, we explicitly extract both geometric and semantic features from the 2D images. We leverage camera parameters to lift these features to 3D. To improve the synergy of 2D-3D feature fusion, we also propose a multi-tower training scheme. We validate our model on the challenging SUN RGB-D dataset, advancing state-of-the-art results by 5.7 mAP. We also provide rich ablation studies to analyze the contribution of each design choice. + +
+ +
+ +## Introduction + +We implement ImVoteNet and provide the result and checkpoints on SUNRGBD. -## Results +## Results and models ### SUNRGBD-2D (Stage 1, image branch pre-train) @@ -29,3 +29,15 @@ We implement ImVoteNet and provide the result and checkpoints on SUNRGBD. | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./imvotenet_stage2_16x8_sunrgbd-3d-10class.py) | 3x |9.4| |64.04||[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021-d44dcb66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021.log.json)| + +## Citation + +```latex +@inproceedings{qi2020imvotenet, + title={Imvotenet: Boosting 3D object detection in point clouds with image votes}, + author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J}, + booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages={4404--4413}, + year={2020} +} +``` diff --git a/configs/imvoxelnet/README.md b/configs/imvoxelnet/README.md index 5f3c5f8199..e79975442e 100644 --- a/configs/imvoxelnet/README.md +++ b/configs/imvoxelnet/README.md @@ -1,27 +1,38 @@ # ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection -## Introduction +> [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection](https://arxiv.org/abs/2106.01178) +## Abstract + +In this paper, we introduce the task of multi-view RGB-based 3D object detection as an end-to-end optimization problem. To address this problem, we propose ImVoxelNet, a novel fully convolutional method of 3D object detection based on posed monocular or multi-view RGB images. The number of monocular images in each multiview input can variate during training and inference; actually, this number might be unique for each multi-view input. ImVoxelNet successfully handles both indoor and outdoor scenes, which makes it general-purpose. Specifically, it achieves state-of-the-art results in car detection on KITTI (monocular) and nuScenes (multi-view) benchmarks among all methods that accept RGB images. Moreover, it surpasses existing RGB-based 3D object detection methods on the SUN RGB-D dataset. On ScanNet, ImVoxelNet sets a new benchmark for multi-view 3D object detection. + +
+ +
+ +## Introduction + We implement a monocular 3D detector ImVoxelNet and provide its results and checkpoints on KITTI dataset. Results for SUN RGB-D, ScanNet and nuScenes are currently available in ImVoxelNet authors [repo](https://github.com/saic-vul/imvoxelnet) (based on mmdetection3d). -``` -@inproceedings{rukhovich2022imvoxelnet, - title={Imvoxelnet: Image to voxels projection for monocular and multi-view general-purpose 3d object detection}, - author={Rukhovich, Danila and Vorontsova, Anna and Konushin, Anton}, - booktitle={Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision}, - pages={2397--2406}, - year={2022} -} -``` - -## Results +## Results and models ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | | :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: | | [ResNet-50](./imvoxelnet_kitti-3d-car.py) | Car | 3x | | |17.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvoxelnet/imvoxelnet_kitti-3d-car_20210610_152323-b9abba85.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvoxelnet/imvoxelnet_kitti-3d-car_20210610_152323.log.json)| + +## Citation + +```latex +@article{rukhovich2021imvoxelnet, + title={ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection}, + author={Danila Rukhovich, Anna Vorontsova, Anton Konushin}, + journal={arXiv preprint arXiv:2106.01178}, + year={2021} +} +``` diff --git a/configs/imvoxelnet/imvoxelnet_4x8_kitti-3d-car.py b/configs/imvoxelnet/imvoxelnet_4x8_kitti-3d-car.py index 47932d7f6d..06ebe62a2a 100644 --- a/configs/imvoxelnet/imvoxelnet_4x8_kitti-3d-car.py +++ b/configs/imvoxelnet/imvoxelnet_4x8_kitti-3d-car.py @@ -25,7 +25,7 @@ anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-0.16, -39.68, -1.78, 68.96, 39.68, -1.78]], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=True), diff_rad_by_sin=True, diff --git a/configs/monoflex/README.md b/configs/monoflex/README.md new file mode 100644 index 0000000000..938dcd12dd --- /dev/null +++ b/configs/monoflex/README.md @@ -0,0 +1,48 @@ +# Objects are Different: Flexible Monocular 3D Object Detection + +> [Objects are Different: Flexible Monocular 3D Object Detection](https://arxiv.org/abs/2104.02323) + + + +## Abstract + +The precise localization of 3D objects from a single image without depth information is a highly challenging problem. Most existing methods adopt the same approach for all objects regardless of their diverse distributions, leading to limited performance for truncated objects. In this paper, we propose a flexible framework for monocular 3D object detection which explicitly decouples the truncated objects and adaptively combines multiple approaches for object depth estimation. Specifically, we decouple the edge of the feature map for predicting long-tail truncated objects so that the optimization of normal objects is not influenced. Furthermore, we formulate the object depth estimation as an uncertainty-guided ensemble of directly regressed object depth and solved depths from different groups of keypoints. Experiments demonstrate that our method outperforms the state-of-the-art method by relatively 27% for the moderate level and 30% for the hard level in the test set of KITTI benchmark while maintaining real-time efficiency. + +
+ +
+ +## Introduction + +We implement MonoFlex and provide the results and checkpoints on KITTI dataset. + +## Results and models + +### KITTI + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | +| :---------: | :-----: | :------: | :------------: | :----: | :------: | +|[DLA34](./monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py)|6x|9.64||21.86|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553.log.json) + +Note: mAP represents Car moderate 3D strict AP11 results. +Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by AP11 and AP40 metric: + +| | Easy | Moderate | Hard | +|-------------|:-------------:|:--------------:|:-------------:| +| Car (AP11) | 28.02 / 36.11 | 21.86 / 29.46 | 19.01 / 24.83 | +| Car (AP40) | 23.22 / 32.74 | 17.18 / 24.02 | 15.13 / 20.67 | + +Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable. Therefore, we only list car detection results here. In addition, the AP11 result may fluctuate in a larger range (~1 AP), so AP40 is a more recommended metric for reference due to its much better stability. + +## Citation + +```latex +@InProceedings{MonoFlex, + author = {Zhang, Yunpeng and Lu, Jiwen and Zhou, Jie}, + title = {Objects Are Different: Flexible Monocular 3D Object Detection}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2021}, + pages = {3289-3298} +} +``` diff --git a/configs/monoflex/metafile.yml b/configs/monoflex/metafile.yml new file mode 100644 index 0000000000..c64dd6ffb9 --- /dev/null +++ b/configs/monoflex/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: MonoFlex + Metadata: + Training Data: KITTI + Training Techniques: + - Adam + Training Resources: 2x V100 GPUS + Architecture: + - MonoFlexHead + - DLA + Paper: + URL: https://arxiv.org/abs/2104.02323 + Title: 'Objects are Different: Flexible Monocular 3D Object Detection' + README: configs/monoflex/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/monoflex.py#L7 + Version: v1.0.0 + +Models: + - Name: monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d + In Collection: MonoFlex + Config: configs/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py + Metadata: + Training Memory (GB): 9.64 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 21.98 + Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth diff --git a/configs/mvxnet/README.md b/configs/mvxnet/README.md index 6a56d5563f..b2a34f101a 100644 --- a/configs/mvxnet/README.md +++ b/configs/mvxnet/README.md @@ -1,12 +1,32 @@ # MVX-Net: Multimodal VoxelNet for 3D Object Detection -## Introduction +> [MVX-Net: Multimodal VoxelNet for 3D Object Detection](https://arxiv.org/abs/1904.01649) +## Abstract + +Many recent works on 3D object detection have focused on designing neural network architectures that can consume point cloud data. 
While these approaches demonstrate encouraging performance, they are typically based on a single modality and are unable to leverage information from other modalities, such as a camera. Although a few approaches fuse data from different modalities, these methods either use a complicated pipeline to process the modalities sequentially, or perform late-fusion and are unable to learn interaction between different modalities at early stages. In this work, we present PointFusion and VoxelFusion: two simple yet effective early-fusion approaches to combine the RGB and point cloud modalities, by leveraging the recently introduced VoxelNet architecture. Evaluation on the KITTI dataset demonstrates significant improvements in performance over approaches which only use point cloud data. Furthermore, the proposed method provides results competitive with the state-of-the-art multimodal algorithms, achieving top-2 ranking in five of the six bird's eye view and 3D detection categories on the KITTI benchmark, by using a simple single stage network. + +
+ +
+ +## Introduction + We implement MVX-Net and provide its results and models on KITTI dataset. -``` +## Results and models + +### KITTI + +| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | +| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | +| [SECFPN](./dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py)|3 Class|cosine 80e|6.7||63.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904.log.json)| + +## Citation + +```latex @inproceedings{sindagi2019mvx, title={MVX-Net: Multimodal voxelnet for 3D object detection}, author={Sindagi, Vishwanath A and Zhou, Yin and Tuzel, Oncel}, @@ -15,13 +35,4 @@ We implement MVX-Net and provide its results and models on KITTI dataset. year={2019}, organization={IEEE} } - ``` - -## Results - -### KITTI - -| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | -| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | -| [SECFPN](./dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py)|3 Class|cosine 80e|6.7||63.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904.log.json)| diff --git a/configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py b/configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py index 213b626dcb..e9f592f5f5 100644 --- a/configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py +++ b/configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py @@ -74,7 +74,7 @@ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), assigner_per_size=True, diff --git a/configs/nuimages/README.md b/configs/nuimages/README.md index 9224013f1d..42267f239d 100644 --- a/configs/nuimages/README.md +++ b/configs/nuimages/README.md @@ -1,9 +1,9 @@ # NuImages Results -## Introduction - +## Introduction + We support and provide some baseline results on [nuImages dataset](https://www.nuscenes.org/nuimages). We follow the class mapping in nuScenes dataset, which maps the original categories into 10 foreground categories. The convert script can be found [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuimage_converter.py). @@ -15,7 +15,7 @@ We will support panoptic segmentation models in the future. The dataset converted by the script of v0.6.0 only supports instance segmentation. Since v0.7.0, we also support to produce semantic segmentation mask of each image; thus, we can train HTC or semantic segmentation models using the dataset. 
To convert the nuImages dataset into COCO format, please use the command below: ```shell -python -u tools/data_converter/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERIONS} \ +python -u tools/data_converter/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERSIONS} \ --out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG} ``` @@ -25,7 +25,7 @@ python -u tools/data_converter/nuimage_converter.py --data-root ${DATA_ROOT} --v - `--nproc`: number of workers for data preparation, defaults to `4`. Larger number could reduce the preparation time as images are processed in parallel. - `--extra-tag`: extra tag of the annotations, defaults to `nuimages`. This can be used to separate different annotations processed in different time for study. -## Results +## Results and models ### Instance Segmentation @@ -55,4 +55,4 @@ We report Mask R-CNN and Cascade Mask R-CNN results on nuimages. 1. `IN` means only using ImageNet pre-trained backbone. `IN+COCO-Nx` and `IN+COCO-Ne` means the backbone is first pre-trained on ImageNet, and then the detector is pre-trained on COCO train2017 dataset by `Nx` and `N` epochs schedules, respectively. 2. All the training hyper-parameters follow the standard schedules on COCO dataset except that the images are resized from 1280 x 720 to 1920 x 1080 (relative ratio 0.8 to 1.2) since the images are in size 1600 x 900. -3. The class order in the detectors released in v0.6.0 is different from the order in the configs because the bug in the convertion script. This bug has been fixed since v0.7.0 and the models trained by the correct class order are also released. If you used nuImages since v0.6.0, please re-convert the data through the convertion script using the above-mentioned command. +3. The class order in the detectors released in v0.6.0 is different from the order in the configs because the bug in the conversion script. This bug has been fixed since v0.7.0 and the models trained by the correct class order are also released. If you used nuImages since v0.6.0, please re-convert the data through the conversion script using the above-mentioned command. diff --git a/configs/nuimages/metafile.yml b/configs/nuimages/metafile.yml index 394096c8d5..7b94ce7d17 100644 --- a/configs/nuimages/metafile.yml +++ b/configs/nuimages/metafile.yml @@ -1,16 +1,3 @@ -Collections: - - Name: Mask R-CNN - Metadata: - Training Data: nuImages - Training Techniques: - - SGD with Momentum - Architecture: - - RoI Align - - RPN - README: configs/nuimages/README.md - Code: - Version: v0.6.0 - Models: - Name: mask_rcnn_r50_fpn_1x_nuim In Collection: Mask R-CNN diff --git a/configs/paconv/README.md b/configs/paconv/README.md index 38b31337d6..0b2bc7275e 100644 --- a/configs/paconv/README.md +++ b/configs/paconv/README.md @@ -1,24 +1,25 @@ # PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds -## Introduction +> [PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds](https://arxiv.org/abs/2103.14635) -We implement PAConv and provide the result and checkpoints on S3DIS dataset. 
+## Abstract -``` -@inproceedings{xu2021paconv, - title={PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds}, - author={Xu, Mutian and Ding, Runyu and Zhao, Hengshuang and Qi, Xiaojuan}, - booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, - pages={3173--3182}, - year={2021} -} -``` +We introduce Position Adaptive Convolution (PAConv), a generic convolution operation for 3D point cloud processing. The key of PAConv is to construct the convolution kernel by dynamically assembling basic weight matrices stored in Weight Bank, where the coefficients of these weight matrices are self-adaptively learned from point positions through ScoreNet. In this way, the kernel is built in a data-driven manner, endowing PAConv with more flexibility than 2D convolutions to better handle the irregular and unordered point cloud data. Besides, the complexity of the learning process is reduced by combining weight matrices instead of brutally predicting kernels from point positions. +Furthermore, different from the existing point convolution operators whose network architectures are often heavily engineered, we integrate our PAConv into classical MLP-based point cloud pipelines without changing network configurations. Even built on simple networks, our method still approaches or even surpasses the state-of-the-art models, and significantly improves baseline performance on both classification and segmentation tasks, yet with decent efficiency. Thorough ablation studies and visualizations are provided to understand PAConv. + +
+ +
+ +## Introduction + +We implement PAConv and provide the result and checkpoints on S3DIS dataset. **Notice**: The original PAConv paper used step learning rate schedule. We discovered that cosine schedule achieves slightly better results and adopt it in our implementations. -## Results +## Results and models ### S3DIS @@ -36,3 +37,15 @@ We implement PAConv and provide the result and checkpoints on S3DIS dataset. ## Indeterminism Since PAConv testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above. + +## Citation + +```latex +@inproceedings{xu2021paconv, + title={PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds}, + author={Xu, Mutian and Ding, Runyu and Zhao, Hengshuang and Qi, Xiaojuan}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={3173--3182}, + year={2021} +} +``` diff --git a/configs/parta2/README.md b/configs/parta2/README.md index 1c35aa38a2..2d1d4b0825 100644 --- a/configs/parta2/README.md +++ b/configs/parta2/README.md @@ -1,12 +1,33 @@ # From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network -## Introduction +> [From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network](https://arxiv.org/abs/1907.03670) +## Abstract + +3D object detection from LiDAR point cloud is a challenging problem in 3D scene understanding and has many practical applications. In this paper, we extend our preliminary work PointRCNN to a novel and strong point-cloud-based 3D object detection framework, the part-aware and aggregation neural network (Part-A2 net). The whole framework consists of the part-aware stage and the part-aggregation stage. Firstly, the part-aware stage for the first time fully utilizes free-of-charge part supervisions derived from 3D ground-truth boxes to simultaneously predict high quality 3D proposals and accurate intra-object part locations. The predicted intra-object part locations within the same proposal are grouped by our new-designed RoI-aware point cloud pooling module, which results in an effective representation to encode the geometry-specific features of each 3D proposal. Then the part-aggregation stage learns to re-score the box and refine the box location by exploring the spatial relationship of the pooled intra-object part locations. Extensive experiments are conducted to demonstrate the performance improvements from each component of our proposed framework. Our Part-A2 net outperforms all existing 3D detection methods and achieves new state-of-the-art on KITTI 3D object detection dataset by utilizing only the LiDAR point cloud data. + +
+ +
+ +## Introduction + We implement Part-A^2 and provide its results and checkpoints on KITTI dataset. -``` +## Results and models + +### KITTI + +| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | +| :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: | +| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py) |3 Class|cyclic 80e|4.1||67.9|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724-a2672098.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724.log.json)| +| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py) |Car |cyclic 80e|4.0||79.16|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755-f2a38b9a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755.log.json)| + +## Citation + +```latex @article{shi2020points, title={From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network}, author={Shi, Shaoshuai and Wang, Zhe and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng}, @@ -15,12 +36,3 @@ We implement Part-A^2 and provide its results and checkpoints on KITTI dataset. publisher={IEEE} } ``` - -## Results - -### KITTI - -| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | -| :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: | -| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py) |3 Class|cyclic 80e|4.1||67.9|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724-a2672098.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724.log.json)| -| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py) |Car |cyclic 80e|4.0||79.16|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755-f2a38b9a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755.log.json)| diff --git a/configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py b/configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py index 91cf983d9c..89be085d84 100644 --- a/configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py +++ b/configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py @@ -10,7 +10,7 @@ _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False)), roi_head=dict( diff --git a/configs/pgd/README.md b/configs/pgd/README.md new file mode 100644 index 0000000000..02e5aa5719 --- /dev/null +++ b/configs/pgd/README.md @@ -0,0 +1,69 @@ +# Probabilistic and Geometric Depth: 
Detecting Objects in Perspective + +> [Probabilistic and Geometric Depth: Detecting Objects in Perspective](https://arxiv.org/abs/2107.14160) + + + +## Abstract + +3D object detection is an important capability needed in various practical applications such as driver assistance systems. Monocular 3D detection, as a representative general setting among image-based approaches, provides a more economical solution than conventional settings relying on LiDARs but still yields unsatisfactory results. This paper first presents a systematic study on this problem. We observe that the current monocular 3D detection can be simplified as an instance depth estimation problem: The inaccurate instance depth blocks all the other 3D attribute predictions from improving the overall detection performance. Moreover, recent methods directly estimate the depth based on isolated instances or pixels while ignoring the geometric relations across different objects. To this end, we construct geometric relation graphs across predicted objects and use the graph to facilitate depth estimation. As the preliminary depth estimation of each instance is usually inaccurate in this ill-posed setting, we incorporate a probabilistic representation to capture the uncertainty. It provides an important indicator to identify confident predictions and further guide the depth propagation. Despite the simplicity of the basic idea, our method, PGD, obtains significant improvements on KITTI and nuScenes benchmarks, achieving 1st place out of all monocular vision-only methods while still maintaining real-time efficiency. Code and models will be released at [this https URL](https://github.com/open-mmlab/mmdetection3d). + +
+ +
+
+## Introduction
+
+PGD, which can also be regarded as FCOS3D++, is a simple yet effective monocular 3D detector. It enhances the FCOS3D baseline by involving local geometric constraints and improving instance depth estimation.
+
+We release the code and models for both the KITTI and nuScenes benchmarks, which is a good supplement to the original FCOS3D baseline (which only supports nuScenes).
+
+For a clean implementation, our preliminary release supports the base models with the proposed local geometric constraints and the probabilistic depth representation. We will involve the geometric graph part in the future.
+
+A more extensive study based on FCOS3D and PGD is ongoing. Please stay tuned.
+
+## Results and models
+
+### KITTI
+
+| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP_11 / mAP_40 | Download |
+| :---------: | :-----: | :------: | :------------: | :----: | :------: |
+|[ResNet101](./pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py)|4x|9.07||18.33 / 13.23|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608.log.json)|
+
+Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by the AP11 and AP40 metrics:
+
+| | Easy | Moderate | Hard |
+|-------------|:-------------:|:--------------:|:-------------:|
+| Car (AP11) | 24.09 / 30.11 | 18.33 / 23.46 | 16.90 / 19.33 |
+| Car (AP40) | 19.27 / 26.60 | 13.23 / 18.23 | 10.65 / 15.00 |
+
+Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable. Therefore, we only list car detection results here. In addition, AP40 is a more recommended metric for reference due to its much better stability.
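+
+The released KITTI model can be tried on a single image with the high-level inference API. A minimal sketch (file paths are placeholders; the `init_model` and `inference_mono_3d_detector` helpers from `mmdet3d.apis` are assumed to be available, as in the monocular demo workflow):
+
+```python
+from mmdet3d.apis import inference_mono_3d_detector, init_model
+
+config_file = 'configs/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py'
+# checkpoint downloaded from the model link in the table above
+checkpoint_file = 'checkpoints/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth'
+
+model = init_model(config_file, checkpoint_file, device='cuda:0')
+
+# The annotation file (COCO-style infos) provides the camera intrinsics for
+# the input image; both paths below are placeholders.
+result, data = inference_mono_3d_detector(
+    model, 'demo_image.png', 'demo_image_mono3d_infos.coco.json')
+```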
+ +### NuScenes + +| Backbone | Lr schd | Mem (GB) | mAP | NDS | Download | +| :---------: | :-----: | :------: | :----: |:----: | :------: | +|[ResNet101 w/ DCN](./pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py)|1x|9.20|31.7|39.3|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350-f4b5eec2.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350.log.json)| +|[above w/ finetune](./pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune.py)|1x|9.20|34.6|41.1|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245-fd419681.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245.log.json)| +|above w/ tta|1x|9.20|35.5|41.8|| +|[ResNet101 w/ DCN](./pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py)|2x|9.20|33.6|40.9|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314-cb677266.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314.log.json)| +|[above w/ finetune](./pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune.py)|2x|9.20|35.8|42.5|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135.log.json)| +|above w/ tta|2x|9.20|36.8|43.1|| + +## Citation + +```latex +@inproceedings{wang2021pgd, + title={{Probabilistic and Geometric Depth: Detecting} Objects in Perspective}, + author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua}, + booktitle={Conference on Robot Learning (CoRL) 2021}, + year={2021} +} +# For the baseline version +@inproceedings{wang2021fcos3d, + title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection}, + author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + year={2021} +} +``` diff --git a/configs/pgd/metafile.yml b/configs/pgd/metafile.yml new file mode 100644 index 0000000000..d7d66265ed --- /dev/null +++ b/configs/pgd/metafile.yml @@ -0,0 +1,81 @@ +Collections: + - Name: PGD + Metadata: + Training Data: KITTI + Training Techniques: + - SGD + Training Resources: 4x TITAN XP + Architecture: + - PGDHead + Paper: + URL: https://arxiv.org/abs/2107.14160 + Title: 'Probabilistic and Geometric Depth: Detecting Objects in Perspective' + README: configs/pgd/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/dense_heads/pgd_head.py#17 + Version: v1.0.0 + +Models: + - Name: pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d + In Collection: PGD + Config: 
configs/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py + Metadata: + Training Memory (GB): 9.1 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 18.33 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth + + - Name: pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d + In Collection: PGD + Config: configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py + Metadata: + Training Memory (GB): 9.2 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 31.7 + NDS: 39.3 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350-f4b5eec2.pth + + - Name: pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune + In Collection: PGD + Config: configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune.py + Metadata: + Training Memory (GB): 9.2 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 34.6 + NDS: 41.1 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245-fd419681.pth + + - Name: pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d + In Collection: PGD + Config: configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py + Metadata: + Training Memory (GB): 9.2 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 33.6 + NDS: 40.9 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314-cb677266.pth + + - Name: pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune + In Collection: PGD + Config: configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune.py + Metadata: + Training Memory (GB): 9.2 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 35.8 + NDS: 42.5 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth diff --git a/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py new file mode 100644 index 0000000000..37b5049313 --- /dev/null +++ b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py @@ -0,0 +1,107 @@ +_base_ = [ + '../_base_/datasets/nus-mono3d.py', '../_base_/models/pgd.py', + '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + bbox_head=dict( + pred_bbox2d=True, + group_reg_dims=(2, 1, 3, 1, 2, + 4), # offset, depth, size, rot, velo, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (), # velo + (256, ) # bbox2d + ), + loss_depth=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((31.99, 21.12), (37.15, 24.63), (39.69, 23.97), + (40.91, 26.34), (34.16, 20.11), (22.35, 13.70), + (24.28, 16.05), (27.26, 15.50), (20.61, 13.68), + (22.74, 15.01)), + base_dims=((4.62, 1.73, 
1.96), (6.93, 2.83, 2.51), + (12.56, 3.89, 2.94), (11.22, 3.50, 2.95), + (6.68, 3.21, 2.85), (6.68, 3.21, 2.85), + (2.11, 1.46, 0.78), (0.73, 1.77, 0.67), + (0.41, 1.08, 0.41), (0.50, 0.99, 2.52)), + code_size=9)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.05 for 2-dim velocity and 0.2 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2 + ]), + test_cfg=dict(nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200)) + +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + lr=0.004, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +total_epochs = 12 +evaluation = dict(interval=4) +runner = dict(max_epochs=total_epochs) diff --git a/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune.py b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune.py new file mode 100644 index 0000000000..f5d64232d3 --- /dev/null +++ b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune.py @@ -0,0 +1,9 @@ +_base_ = './pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py' +# model settings +model = dict( + train_cfg=dict(code_weight=[ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2 + ])) +# optimizer +optimizer = dict(lr=0.002) +load_from = 'work_dirs/pgd_nus_benchmark_1x/latest.pth' diff --git a/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py new file mode 100644 index 0000000000..2dd595753e --- /dev/null +++ b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py @@ -0,0 +1,5 @@ +_base_ = './pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d.py' +# learning policy +lr_config = dict(step=[16, 22]) +total_epochs = 24 +runner = dict(max_epochs=total_epochs) diff --git a/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune.py 
b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune.py new file mode 100644 index 0000000000..19a3d630b8 --- /dev/null +++ b/configs/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune.py @@ -0,0 +1,9 @@ +_base_ = './pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d.py' +# model settings +model = dict( + train_cfg=dict(code_weight=[ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2 + ])) +# optimizer +optimizer = dict(lr=0.002) +load_from = 'work_dirs/pgd_nus_benchmark_2x/latest.pth' diff --git a/configs/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py b/configs/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py new file mode 100644 index 0000000000..832b34e64d --- /dev/null +++ b/configs/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py @@ -0,0 +1,127 @@ +_base_ = [ + '../_base_/datasets/kitti-mono3d.py', '../_base_/models/pgd.py', + '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict(frozen_stages=0), + neck=dict(start_level=0, num_outs=4), + bbox_head=dict( + num_classes=3, + bbox_code_size=7, + pred_attrs=False, + pred_velo=False, + pred_bbox2d=True, + use_onlyreg_proj=True, + strides=(4, 8, 16, 32), + regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 1e8)), + group_reg_dims=(2, 1, 3, 1, 16, + 4), # offset, depth, size, rot, kpts, bbox2d + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + (256, ), # kpts + (256, ) # bbox2d + ), + centerness_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + use_depth_classifier=True, + depth_branch=(256, ), + depth_range=(0, 70), + depth_unit=10, + division='uniform', + depth_bins=8, + pred_keypoints=True, + weight_dim=1, + loss_depth=dict( + type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0, + loss_weight=1.0), + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((28.01, 16.32), ), + base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), + code_size=7)), + # set weight 1.0 for base 7 dims (offset, depth, size, rot) + # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets + train_cfg=dict(code_weight=[ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0 + ]), + test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)) + +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1242, 375), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d', + 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + 
type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=3, + workers_per_gpu=3, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + lr=0.001, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[32, 44]) +total_epochs = 48 +runner = dict(type='EpochBasedRunner', max_epochs=48) +evaluation = dict(interval=2) +checkpoint_config = dict(interval=8) diff --git a/configs/point_rcnn/README.md b/configs/point_rcnn/README.md new file mode 100644 index 0000000000..cf59e3f6c5 --- /dev/null +++ b/configs/point_rcnn/README.md @@ -0,0 +1,47 @@ +# PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud + +> [PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud](https://arxiv.org/abs/1812.04244) + + + +## Abstract + +In this paper, we propose PointRCNN for 3D object detection from raw point cloud. The whole framework is composed of two stages: stage-1 for the bottom-up 3D proposal generation and stage-2 for refining proposals in the canonical coordinates to obtain the final detection results. Instead of generating proposals from RGB image or projecting point cloud to bird's view or voxels as previous methods do, our stage-1 sub-network directly generates a small number of high-quality 3D proposals from point cloud in a bottom-up manner via segmenting the point cloud of the whole scene into foreground points and background. The stage-2 sub-network transforms the pooled points of each proposal to canonical coordinates to learn better local spatial features, which is combined with global semantic features of each point learned in stage-1 for accurate box refinement and confidence prediction. Extensive experiments on the 3D detection benchmark of KITTI dataset show that our proposed architecture outperforms state-of-the-art methods with remarkable margins by using only point cloud as input. + +
+ +
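To make the stage-2 refinement described in the abstract more concrete, the sketch below shows the canonical transformation it relies on: points pooled inside a proposal are translated to the proposal center and rotated by the negative box yaw, so the refinement head always sees the object in a normalized local frame. This is an illustrative sketch, not the implementation in this repository; the box layout (x, y, z, dx, dy, dz, yaw) is an assumption.

```python
import numpy as np


def canonical_transform(points: np.ndarray, box: np.ndarray) -> np.ndarray:
    """Map pooled points (N, 3) into the canonical frame of one 3D proposal."""
    center, yaw = box[:3], box[6]
    shifted = points - center  # move the proposal center to the origin
    cos_t, sin_t = np.cos(-yaw), np.sin(-yaw)
    # rotate around z by -yaw so the box heading aligns with the +x axis
    rot = np.array([[cos_t, -sin_t, 0.0],
                    [sin_t, cos_t, 0.0],
                    [0.0, 0.0, 1.0]])
    return shifted @ rot.T


# toy usage: a proposal at (10, 5, -1) with a 30-degree heading
points = np.random.rand(128, 3) + np.array([10.0, 5.0, -1.0])
proposal = np.array([10.0, 5.0, -1.0, 3.9, 1.6, 1.56, np.pi / 6])
local_points = canonical_transform(points, proposal)
```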
+ +## Introduction + +We implement PointRCNN and provide the result with checkpoints on KITTI dataset. + +## Results and models + +### KITTI + +| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | +| :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: | +| [PointNet++](./point_rcnn_2x8_kitti-3d-3classes.py) |3 Class|cyclic 40e|4.6||70.83|[model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.pth) | [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.log.json)| + +Note: mAP represents AP11 results on 3 Class under the moderate setting. + +Detailed performance on KITTI 3D detection (3D) is as follows, evaluated by AP11 metric: + +| | Easy | Moderate | Hard | +|-------------|:-------------:|:--------------:|:------------:| +| Car | 89.13 | 78.72 | 78.24 | +| Pedestrian | 65.81 | 59.57 | 52.75 | +| Cyclist | 93.51 | 74.19 | 70.73 | + +## Citation + +```latex +@inproceedings{Shi_2019_CVPR, + title = {PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud}, + author = {Shi, Shaoshuai and Wang, Xiaogang and Li, Hongsheng}, + booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2019} +} +``` diff --git a/configs/point_rcnn/metafile.yml b/configs/point_rcnn/metafile.yml new file mode 100644 index 0000000000..a7627cee6b --- /dev/null +++ b/configs/point_rcnn/metafile.yml @@ -0,0 +1,29 @@ +Collections: + - Name: PointRCNN + Metadata: + Training Data: KITTI + Training Techniques: + - AdamW + Training Resources: 8x Titan XP GPUs + Architecture: + - PointNet++ + Paper: + URL: https://arxiv.org/abs/1812.04244 + Title: 'PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud' + README: configs/point_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/point_rcnn.py#L8 + Version: v1.0.0 + +Models: + - Name: point_rcnn_2x8_kitti-3d-3classes.py + In Collection: PointRCNN + Config: configs/point_rcnn/point_rcnn_2x8_kitti-3d-3classes.py + Metadata: + Training Memory (GB): 4.6 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 70.83 + Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.pth diff --git a/configs/point_rcnn/point_rcnn_2x8_kitti-3d-3classes.py b/configs/point_rcnn/point_rcnn_2x8_kitti-3d-3classes.py new file mode 100644 index 0000000000..1344aca5c5 --- /dev/null +++ b/configs/point_rcnn/point_rcnn_2x8_kitti-3d-3classes.py @@ -0,0 +1,94 @@ +_base_ = [ + '../_base_/datasets/kitti-3d-car.py', '../_base_/models/point_rcnn.py', + '../_base_/default_runtime.py', '../_base_/schedules/cyclic_40e.py' +] + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), + sample_groups=dict(Car=20, Pedestrian=15, Cyclist=15), + classes=class_names) + +train_pipeline = [ + dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), + dict(type='LoadAnnotations3D', with_bbox_3d=True, 
with_label_3d=True), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectSample', db_sampler=db_sampler), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointSample', num_points=16384, sample_range=40.0), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointSample', num_points=16384, sample_range=40.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict(pipeline=train_pipeline, classes=class_names)), + val=dict(pipeline=test_pipeline, classes=class_names), + test=dict(pipeline=test_pipeline, classes=class_names)) + +# optimizer +lr = 0.001 # max learning rate +optimizer = dict(lr=lr, betas=(0.95, 0.85)) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=80) +evaluation = dict(interval=2) +# yapf:disable +log_config = dict( + interval=30, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable diff --git a/configs/pointnet2/README.md b/configs/pointnet2/README.md index fc6978bf8e..e91c23f013 100644 --- a/configs/pointnet2/README.md +++ b/configs/pointnet2/README.md @@ -1,24 +1,24 @@ # PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space -## Introduction +> [PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space](https://arxiv.org/abs/1706.02413) -We implement PointNet++ and provide the result and checkpoints on ScanNet and S3DIS datasets. +## Abstract -``` -@inproceedings{qi2017pointnet++, - title={PointNet++ deep hierarchical feature learning on point sets in a metric space}, - author={Qi, Charles R and Yi, Li and Su, Hao and Guibas, Leonidas J}, - booktitle={Proceedings of the 31st International Conference on Neural Information Processing Systems}, - pages={5105--5114}, - year={2017} -} -``` +Few prior works study deep learning on point sets. PointNet by Qi et al. is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. 
With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds. + +
+ +
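The nested partitioning mentioned in the abstract starts from farthest point sampling, which picks well-spread centroids for each set-abstraction level before neighborhoods are grouped around them. The sketch below is a small, unoptimized NumPy reference for illustration only; the codebase itself relies on a CUDA implementation of this step.

```python
import numpy as np


def farthest_point_sample(points: np.ndarray, num_samples: int) -> np.ndarray:
    """Greedy farthest point sampling: points (N, 3) -> selected indices."""
    num_points = points.shape[0]
    selected = np.zeros(num_samples, dtype=np.int64)
    # squared distance from every point to its nearest already-selected centroid
    min_dist = np.full(num_points, np.inf)
    selected[0] = 0  # start from an arbitrary point
    for i in range(1, num_samples):
        diff = points - points[selected[i - 1]]
        min_dist = np.minimum(min_dist, np.einsum('ij,ij->i', diff, diff))
        selected[i] = int(np.argmax(min_dist))  # farthest from the current set
    return selected


points = np.random.rand(4096, 3)
centroid_idx = farthest_point_sample(points, 512)  # centroids for one SA level
```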
+ +## Introduction + +We implement PointNet++ and provide the result and checkpoints on ScanNet and S3DIS datasets. **Notice**: The original PointNet++ paper used step learning rate schedule. We discovered that cosine schedule achieves much better results and adopt it in our implementations. We also use a larger `weight_decay` factor because we find it consistently improves the performance. -## Results +## Results and models ### ScanNet @@ -56,3 +56,15 @@ We implement PointNet++ and provide the result and checkpoints on ScanNet and S3 ## Indeterminism Since PointNet++ testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above. + +## Citation + +```latex +@inproceedings{qi2017pointnet++, + title={PointNet++ deep hierarchical feature learning on point sets in a metric space}, + author={Qi, Charles R and Yi, Li and Su, Hao and Guibas, Leonidas J}, + booktitle={Proceedings of the 31st International Conference on Neural Information Processing Systems}, + pages={5105--5114}, + year={2017} +} +``` diff --git a/configs/pointpillars/README.md b/configs/pointpillars/README.md index b0dae78187..ff817621bd 100644 --- a/configs/pointpillars/README.md +++ b/configs/pointpillars/README.md @@ -1,23 +1,22 @@ # PointPillars: Fast Encoders for Object Detection from Point Clouds -## Introduction +> [PointPillars: Fast Encoders for Object Detection from Point Clouds](https://arxiv.org/abs/1812.05784) -We implement PointPillars and provide the results and checkpoints on KITTI, nuScenes, Lyft and Waymo datasets. +## Abstract -``` -@inproceedings{lang2019pointpillars, - title={Pointpillars: Fast encoders for object detection from point clouds}, - author={Lang, Alex H and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={12697--12705}, - year={2019} -} +Object detection in point clouds is an important aspect of many robotics applications such as autonomous driving. In this paper we consider the problem of encoding a point cloud into a format appropriate for a downstream detection pipeline. Recent literature suggests two types of encoders; fixed encoders tend to be fast but sacrifice accuracy, while encoders that are learned from data are more accurate, but slower. In this work we propose PointPillars, a novel encoder which utilizes PointNets to learn a representation of point clouds organized in vertical columns (pillars). While the encoded features can be used with any standard 2D convolutional detection architecture, we further propose a lean downstream network. Extensive experimentation shows that PointPillars outperforms previous encoders with respect to both speed and accuracy by a large margin. Despite only using lidar, our full detection pipeline significantly outperforms the state of the art, even among fusion methods, with respect to both the 3D and bird's eye view KITTI benchmarks. This detection performance is achieved while running at 62 Hz: a 2 - 4 fold runtime improvement. A faster version of our method matches the state of the art at 105 Hz. These benchmarks suggest that PointPillars is an appropriate encoding for object detection in point clouds. -``` +
+ +
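As a rough illustration of the pillar encoding described in the abstract, the sketch below scatters per-pillar feature vectors back onto a dense BEV canvas, the step that turns learned pillar features into a pseudo-image a standard 2D detection backbone can consume. It is a simplified stand-in for the actual voxelization and scatter modules, with made-up tensor shapes.

```python
import torch


def scatter_pillars(pillar_features: torch.Tensor, coords: torch.Tensor,
                    ny: int, nx: int) -> torch.Tensor:
    """Scatter (P, C) pillar features onto a (C, ny, nx) BEV pseudo-image."""
    num_pillars, channels = pillar_features.shape
    canvas = pillar_features.new_zeros(channels, ny * nx)
    flat_idx = coords[:, 0] * nx + coords[:, 1]  # linearize (y, x) grid indices
    canvas[:, flat_idx] = pillar_features.t()    # empty cells stay zero
    return canvas.view(channels, ny, nx)


features = torch.randn(1200, 64)           # 1200 non-empty pillars, 64-dim features
coords = torch.randint(0, 496, (1200, 2))  # (y, x) indices on a 496x496 grid
bev = scatter_pillars(features, coords, ny=496, nx=496)  # input to the 2D backbone
```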
-## Results +## Introduction + +We implement PointPillars and provide the results and checkpoints on KITTI, nuScenes, Lyft and Waymo datasets. + +## Results and models ### KITTI @@ -31,7 +30,9 @@ We implement PointPillars and provide the results and checkpoints on KITTI, nuSc | Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | |[SECFPN](./hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)| +|[SECFPN (FP16)](./hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|8.37||35.19|50.27|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json)| |[FPN](./hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)| +|[FPN (FP16)](./hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|8.40||39.26|53.26|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)| ### Lyft @@ -62,3 +63,16 @@ We implement PointPillars and provide the results and checkpoints on KITTI, nuSc - **Implementation Details**: We basically follow the implementation in the [paper](https://arxiv.org/pdf/1912.04838.pdf) in terms of the network architecture (having a stride of 1 for the first convolutional block). Different settings of voxelization, data augmentation and hyper parameters make these baselines outperform those in the paper by about 7 mAP for car and 4 mAP for pedestrian with only a subset of the whole dataset. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation. - **License Aggrement**: To comply the [license agreement of Waymo dataset](https://waymo.com/open/terms/), the pre-trained models on Waymo dataset are not released. We still release the training log as a reference to ease the future research. +- `FP16` means Mixed Precision (FP16) is adopted in training. With mixed precision training, we can train PointPillars with nuScenes dataset on 8 Titan XP GPUS with batch size of 2. This will cause OOM error without mixed precision training. 
The loss scale for PointPillars on nuScenes dataset is specifically tuned to avoid the loss to be Nan. We find 32 is more stable than 512, though loss scale 32 still cause Nan sometimes. + +## Citation + +```latex +@inproceedings{lang2019pointpillars, + title={Pointpillars: Fast encoders for object detection from point clouds}, + author={Lang, Alex H and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={12697--12705}, + year={2019} +} +``` diff --git a/configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py b/configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py similarity index 66% rename from configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py rename to configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py index bf91f543b0..9764aa3308 100644 --- a/configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py +++ b/configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = '../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py' +_base_ = './hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) diff --git a/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py b/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py index 5c5c939de5..2611e86d3a 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py @@ -15,21 +15,15 @@ rate=1.0, prepare=dict( filter_by_difficulty=[-1], - filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), classes=class_names, - sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10)) + sample_groups=dict(Car=15, Pedestrian=15, Cyclist=15)) # PointPillars uses different augmentation hyper parameters train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict(type='ObjectSample', db_sampler=db_sampler), - dict( - type='ObjectNoise', - num_try=100, - translation_std=[0.25, 0.25, 0.25], - global_rot_range=[0.0, 0.0], - rot_range=[-0.15707963267, 0.15707963267]), + dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=False), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', diff --git a/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py b/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py index 1e0f0faf9b..50b89d6aae 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py @@ -10,7 +10,7 @@ _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings diff --git a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py index 46d7b06723..1a0400eb33 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py +++ 
b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py @@ -29,15 +29,15 @@ [-80, -80, -0.9122268, 80, 80, -0.9122268], [-80, -80, -1.8012227, 80, 80, -1.8012227]], sizes=[ - [1.92, 4.75, 1.71], # car - [2.84, 10.24, 3.44], # truck - [2.92, 12.70, 3.42], # bus - [2.42, 6.52, 2.34], # emergency vehicle - [2.75, 8.17, 3.20], # other vehicle - [0.96, 2.35, 1.59], # motorcycle - [0.63, 1.76, 1.44], # bicycle - [0.76, 0.80, 1.76], # pedestrian - [0.35, 0.73, 0.50] # animal + [4.75, 1.92, 1.71], # car + [10.24, 2.84, 3.44], # truck + [12.70, 2.92, 3.42], # bus + [6.52, 2.42, 2.34], # emergency vehicle + [8.17, 2.75, 3.20], # other vehicle + [2.35, 0.96, 1.59], # motorcycle + [1.76, 0.63, 1.44], # bicycle + [0.80, 0.76, 1.76], # pedestrian + [0.73, 0.35, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) diff --git a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py index 868c7ff8c9..afff99c630 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py @@ -29,13 +29,13 @@ [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965], ], sizes=[ - [1.95017717, 4.60718145, 1.72270761], # car - [2.4560939, 6.73778078, 2.73004906], # truck - [2.87427237, 12.01320693, 3.81509561], # trailer - [0.60058911, 1.68452161, 1.27192197], # bicycle - [0.66344886, 0.7256437, 1.75748069], # pedestrian - [0.39694519, 0.40359262, 1.06232151], # traffic_cone - [2.49008838, 0.48578221, 0.98297065], # barrier + [4.60718145, 1.95017717, 1.72270761], # car + [6.73778078, 2.4560939, 2.73004906], # truck + [12.01320693, 2.87427237, 3.81509561], # trailer + [1.68452161, 0.60058911, 1.27192197], # bicycle + [0.7256437, 0.66344886, 1.75748069], # pedestrian + [0.40359262, 0.39694519, 1.06232151], # traffic_cone + [0.48578221, 2.49008838, 0.98297065], # barrier ], custom_values=[0, 0], rotations=[0, 1.57], diff --git a/configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py similarity index 65% rename from configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py rename to configs/pointpillars/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py index 0bbbb6c6c0..ff0f67a043 100644 --- a/configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = '../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py' +_base_ = './hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) 
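Tying the relocated FP16 configs above together: enabling mixed precision in these configs amounts to inheriting a base config and adding an `fp16` dict, with the static loss scale chosen as discussed in the README note (32 for PointPillars on nuScenes). The sketch below is a hypothetical user config following that pattern; the file name and overrides are illustrative.

```python
# my_pointpillars_fp16_nus.py (hypothetical config mirroring the FP16 configs in this PR)
_base_ = './hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py'

# smaller per-GPU batch; mixed precision roughly halves the training memory
data = dict(samples_per_gpu=2, workers_per_gpu=2)

# fp16 settings; the static loss scale is tuned to keep the loss from becoming NaN
fp16 = dict(loss_scale=32.)
```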
diff --git a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py index fa18aca2c3..7964b79982 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py @@ -28,15 +28,15 @@ [-100, -100, -0.9122268, 100, 100, -0.9122268], [-100, -100, -1.8012227, 100, 100, -1.8012227]], sizes=[ - [1.92, 4.75, 1.71], # car - [2.84, 10.24, 3.44], # truck - [2.92, 12.70, 3.42], # bus - [2.42, 6.52, 2.34], # emergency vehicle - [2.75, 8.17, 3.20], # other vehicle - [0.96, 2.35, 1.59], # motorcycle - [0.63, 1.76, 1.44], # bicycle - [0.76, 0.80, 1.76], # pedestrian - [0.35, 0.73, 0.50] # animal + [4.75, 1.92, 1.71], # car + [10.24, 2.84, 3.44], # truck + [12.70, 2.92, 3.42], # bus + [6.52, 2.42, 2.34], # emergency vehicle + [8.17, 2.75, 3.20], # other vehicle + [2.35, 0.96, 1.59], # motorcycle + [1.76, 0.63, 1.44], # bicycle + [0.80, 0.76, 1.76], # pedestrian + [0.73, 0.35, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) diff --git a/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py index aeac750d9e..90f2a42c53 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py @@ -17,7 +17,7 @@ anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]], - sizes=[[2.08, 4.73, 1.77]], + sizes=[[4.73, 2.08, 1.77]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings diff --git a/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py b/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py index 1fe32fd404..3a3e326698 100644 --- a/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py +++ b/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py @@ -14,7 +14,7 @@ anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]], - sizes=[[2.08, 4.73, 1.77]], + sizes=[[4.73, 2.08, 1.77]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings diff --git a/configs/pointpillars/metafile.yml b/configs/pointpillars/metafile.yml index 581efb127b..a7e419d7f6 100644 --- a/configs/pointpillars/metafile.yml +++ b/configs/pointpillars/metafile.yml @@ -167,3 +167,47 @@ Models: mAPH@L1: 63.3 mAP@L2: 62.6 mAPH@L2: 57.6 + + - Name: hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d + In Collection: PointPillars + Config: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py + Metadata: + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x TITAN Xp + Architecture: + - Hard Voxelization + Training Data: nuScenes + Training Memory (GB): 8.37 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 35.19 + NDS: 50.27 + Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth + Code: + Version: v0.7.0 + + - Name: hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d + In Collection: PointPillars + Config: 
configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py + Metadata: + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x TITAN Xp + Architecture: + - Hard Voxelization + Training Data: nuScenes + Training Memory (GB): 8.40 + Results: + - Task: 3D Object Detection + Dataset: nuScenes + Metrics: + mAP: 39.26 + NDS: 53.26 + Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth + Code: + Version: v0.7.0 diff --git a/configs/regnet/README.md b/configs/regnet/README.md index 3e74a37a82..f981123937 100644 --- a/configs/regnet/README.md +++ b/configs/regnet/README.md @@ -1,24 +1,23 @@ # Designing Network Design Spaces -## Introduction +> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) +## Abstract + +In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs. + +
+ +
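The 'quantized linear function' mentioned in the abstract can be written down directly: block widths grow linearly with the block index and are then snapped to powers of the multiplier `w_m` (and to multiples of 8). The sketch below reproduces that generation rule for illustration, following the published RegNet parametrization rather than any code in this repository; the RegNetX-400MF parameters in the example are the commonly cited values.

```python
import numpy as np


def regnet_widths(w0: float, wa: float, wm: float, depth: int, q: int = 8) -> np.ndarray:
    """Generate per-block widths from the RegNet (w0, wa, wm) parametrization."""
    u = w0 + wa * np.arange(depth)              # continuous linear widths
    s = np.round(np.log(u / w0) / np.log(wm))   # quantize to powers of wm
    widths = w0 * np.power(wm, s)
    return (np.round(widths / q) * q).astype(int)  # snap to multiples of q


# e.g. RegNetX-400MF: w0=24, wa=24.48, wm=2.54, depth=22 (commonly cited values)
print(regnet_widths(24, 24.48, 2.54, 22))
```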
+ +## Introduction + We implement RegNetX models in 3D detection systems and provide their first results with PointPillars on nuScenes and Lyft dataset. The pre-trained modles are converted from [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv). -``` -@article{radosavovic2020designing, - title={Designing Network Design Spaces}, - author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, - year={2020}, - eprint={2003.13678}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` - ## Usage To use a regnet model, there are two steps to do: @@ -47,7 +46,7 @@ For other pre-trained models or self-implemented regnet models, the users are re **Note**: Although Fig. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized thus inaccurate, using them sometimes produces different backbone that does not match the key in the pre-trained model. -## Results +## Results and models ### nuScenes @@ -67,3 +66,16 @@ For other pre-trained models or self-implemented regnet models, the users are re |[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_lyft-3d.py)| 2x |15.9||14.9|15.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151-42513826.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151.log.json)| |[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py)|2x|9.2||14.9|15.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818.log.json)| |[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_lyft-3d.py)|2x|13.0||16.0|16.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618-823dcf18.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618.log.json)| + +## Citation + +```latex +@article{radosavovic2020designing, + title={Designing Network Design Spaces}, + author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, + year={2020}, + eprint={2003.13678}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/fp16/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py b/configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py similarity index 64% rename from configs/fp16/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py rename to configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py index 76d13fdd57..e586365264 100644 --- a/configs/fp16/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py +++ 
b/configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = '../regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py' +_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) diff --git a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py index f1a3f4d828..fb330d7855 100644 --- a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py +++ b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py @@ -25,15 +25,15 @@ [-80, -80, -0.9122268, 80, 80, -0.9122268], [-80, -80, -1.8012227, 80, 80, -1.8012227]], sizes=[ - [1.92, 4.75, 1.71], # car - [2.84, 10.24, 3.44], # truck - [2.92, 12.70, 3.42], # bus - [2.42, 6.52, 2.34], # emergency vehicle - [2.75, 8.17, 3.20], # other vehicle - [0.96, 2.35, 1.59], # motorcycle - [0.63, 1.76, 1.44], # bicycle - [0.76, 0.80, 1.76], # pedestrian - [0.35, 0.73, 0.50] # animal + [4.75, 1.92, 1.71], # car + [10.24, 2.84, 3.44], # truck + [12.70, 2.92, 3.42], # bus + [6.52, 2.42, 2.34], # emergency vehicle + [8.17, 2.75, 3.20], # other vehicle + [2.35, 0.96, 1.59], # motorcycle + [1.76, 0.63, 1.44], # bicycle + [0.80, 0.76, 1.76], # pedestrian + [0.73, 0.35, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) diff --git a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py index 0f9e031f90..ef8996a18a 100644 --- a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py +++ b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py @@ -25,13 +25,13 @@ [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965], ], sizes=[ - [1.95017717, 4.60718145, 1.72270761], # car - [2.4560939, 6.73778078, 2.73004906], # truck - [2.87427237, 12.01320693, 3.81509561], # trailer - [0.60058911, 1.68452161, 1.27192197], # bicycle - [0.66344886, 0.7256437, 1.75748069], # pedestrian - [0.39694519, 0.40359262, 1.06232151], # traffic_cone - [2.49008838, 0.48578221, 0.98297065], # barrier + [4.60718145, 1.95017717, 1.72270761], # car + [6.73778078, 2.4560939, 2.73004906], # truck + [12.01320693, 2.87427237, 3.81509561], # trailer + [1.68452161, 0.60058911, 1.27192197], # bicycle + [0.7256437, 0.66344886, 1.75748069], # pedestrian + [0.40359262, 0.39694519, 1.06232151], # traffic_cone + [0.48578221, 2.49008838, 0.98297065], # barrier ], custom_values=[0, 0], rotations=[0, 1.57], diff --git a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py index c7bc8f16eb..2af3719c9b 100644 --- a/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py +++ b/configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py @@ -26,15 +26,15 @@ [-100, -100, -0.9122268, 100, 100, -0.9122268], [-100, -100, -1.8012227, 100, 100, -1.8012227]], sizes=[ - [1.92, 4.75, 1.71], # car - [2.84, 10.24, 3.44], # truck - [2.92, 12.70, 3.42], # bus - [2.42, 6.52, 2.34], # emergency vehicle - [2.75, 8.17, 3.20], # other vehicle - [0.96, 2.35, 1.59], # motorcycle - [0.63, 1.76, 1.44], # bicycle - [0.76, 0.80, 1.76], # pedestrian - [0.35, 0.73, 0.50] # animal + 
[4.75, 1.92, 1.71], # car + [10.24, 2.84, 3.44], # truck + [12.70, 2.92, 3.42], # bus + [6.52, 2.42, 2.34], # emergency vehicle + [8.17, 2.75, 3.20], # other vehicle + [2.35, 0.96, 1.59], # motorcycle + [1.76, 0.63, 1.44], # bicycle + [0.80, 0.76, 1.76], # pedestrian + [0.73, 0.35, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) diff --git a/configs/second/README.md b/configs/second/README.md index a79c0fdf2b..6180656e07 100644 --- a/configs/second/README.md +++ b/configs/second/README.md @@ -1,29 +1,31 @@ # Second: Sparsely embedded convolutional detection -## Introduction +> [SECOND: Sparsely Embedded Convolutional Detection](https://www.mdpi.com/1424-8220/18/10/3337) -We implement SECOND and provide the results and checkpoints on KITTI dataset. +## Abstract -``` -@article{yan2018second, - title={Second: Sparsely embedded convolutional detection}, - author={Yan, Yan and Mao, Yuxing and Li, Bo}, - journal={Sensors}, - year={2018}, - publisher={Multidisciplinary Digital Publishing Institute} -} -``` +LiDAR-based or RGB-D-based object detection is used in numerous applications, ranging from autonomous driving to robot vision. Voxel-based 3D convolutional networks have been used for some time to enhance the retention of information when processing point cloud LiDAR data. However, problems remain, including a slow inference speed and low orientation estimation performance. We therefore investigate an improved sparse convolution method for such networks, which significantly increases the speed of both training and inference. We also introduce a new form of angle loss regression to improve the orientation estimation performance and a new data augmentation approach that can enhance the convergence speed and performance. The proposed network produces state-of-the-art results on the KITTI 3D object detection benchmarks while maintaining a fast inference speed. -## Results +
+ +
+ +## Introduction + +We implement SECOND and provide the results and checkpoints on KITTI dataset. + +## Results and models ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4||79.07|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238.log.json)| +| [SECFPN (FP16)](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|2.9||78.72|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)| | [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4||64.41|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238.log.json)| +| [SECFPN (FP16)](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|2.9||67.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json)| ### Waymo @@ -34,4 +36,19 @@ We implement SECOND and provide the results and checkpoints on KITTI dataset. | above @ Pedestrian|||2x|8.12||68.1|59.1|59.5|51.5| | | above @ Cyclist|||2x|8.12||60.7|59.5|58.4|57.3| | -Note: See more details about metrics and data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation. +Note: + +- See more details about metrics and data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation. +- `FP16` means Mixed Precision (FP16) is adopted in training. 
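As the note above says, the FP16 variants rely on a hand-tuned static loss scale (512 here, versus 32 for PointPillars on nuScenes). If a static value still produces occasional NaNs in your own runs, the FP16 optimizer hook in recent mmcv versions can also manage the scale dynamically; this is an assumption about the installed mmcv, so verify it before relying on it.

```python
# Hypothetical override config: switch from a fixed loss scale to dynamic scaling
# (assumes the installed mmcv supports loss_scale='dynamic').
_base_ = './hv_second_secfpn_6x8_80e_kitti-3d-3class.py'

fp16 = dict(loss_scale='dynamic')
```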
+ +## Citation + +```latex +@article{yan2018second, + title={Second: Sparsely embedded convolutional detection}, + author={Yan, Yan and Mao, Yuxing and Li, Bo}, + journal={Sensors}, + year={2018}, + publisher={Multidisciplinary Digital Publishing Institute} +} +``` diff --git a/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py b/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py index c4f2ffd51a..9ab7350ac4 100644 --- a/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py +++ b/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py @@ -12,7 +12,7 @@ _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings diff --git a/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py b/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py new file mode 100644 index 0000000000..bf0336a45b --- /dev/null +++ b/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py @@ -0,0 +1,3 @@ +_base_ = './hv_second_secfpn_6x8_80e_kitti-3d-3class.py' +# fp16 settings +fp16 = dict(loss_scale=512.) diff --git a/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py b/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py new file mode 100644 index 0000000000..efba55330d --- /dev/null +++ b/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py @@ -0,0 +1,3 @@ +_base_ = './hv_second_secfpn_6x8_80e_kitti-3d-car.py' +# fp16 settings +fp16 = dict(loss_scale=512.) diff --git a/configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py b/configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py index aae54b33a8..6412f535d5 100644 --- a/configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py +++ b/configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py @@ -21,7 +21,10 @@ classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( - type='LoadPointsFromFile', load_dim=5, use_dim=[0, 1, 2, 3, 4])) + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4])) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), diff --git a/configs/second/metafile.yml b/configs/second/metafile.yml index bd6311d21e..1d83b90f8b 100644 --- a/configs/second/metafile.yml +++ b/configs/second/metafile.yml @@ -57,3 +57,41 @@ Models: mAPH@L1: 61.7 mAP@L2: 58.9 mAPH@L2: 55.7 + + - Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-car + In Collection: SECOND + Config: configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py + Metadata: + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x TITAN Xp + Training Data: KITTI + Training Memory (GB): 2.9 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 78.72 + Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth + Code: + Version: v0.7.0 + + - Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class + In Collection: SECOND + Config: configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py + Metadata: + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x TITAN Xp + Training Data: KITTI + Training Memory (GB): 2.9 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 67.4 + Weights: 
https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth + Code: + Version: v0.7.0 diff --git a/configs/smoke/README.md b/configs/smoke/README.md new file mode 100644 index 0000000000..9c3a4bf3d8 --- /dev/null +++ b/configs/smoke/README.md @@ -0,0 +1,47 @@ +# SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation + +> [SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation](https://arxiv.org/abs/2002.10111) + + + +## Abstract + +Estimating 3D orientation and translation of objects is essential for infrastructure-less autonomous navigation and driving. In case of monocular vision, successful methods have been mainly based on two ingredients: (i) a network generating 2D region proposals, (ii) a R-CNN structure predicting 3D object pose by utilizing the acquired regions of interest. We argue that the 2D detection network is redundant and introduces non-negligible noise for 3D detection. Hence, we propose a novel 3D object detection method, named SMOKE, in this paper that predicts a 3D bounding box for each detected object by combining a single keypoint estimate with regressed 3D variables. As a second contribution, we propose a multi-step disentangling approach for constructing the 3D bounding box, which significantly improves both training convergence and detection accuracy. In contrast to previous 3D detection techniques, our method does not require complicated pre/post-processing, extra data, and a refinement stage. Despite of its structural simplicity, our proposed SMOKE network outperforms all existing monocular 3D detection methods on the KITTI dataset, giving the best state-of-the-art result on both 3D object detection and Bird's eye view evaluation. + +
+ +## Introduction + +We implement SMOKE and provide the results and checkpoints on KITTI dataset. + +## Results and models + +### KITTI + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | +| :---------: | :-----: | :------: | :------------: | :----: | :------: | +|[DLA34](./smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py)|6x|9.64||13.85|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553-d46d9bb0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553.log.json) + +Note: mAP represents Car moderate 3D strict AP11 results. + +Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by AP11 metric: + +| | Easy | Moderate | Hard | +|-------------|:-------------:|:--------------:|:------------:| +| Car | 16.92 / 22.97 | 13.85 / 18.32 | 11.90 / 15.88| +| Pedestrian | 11.13 / 12.61| 11.10 / 11.32 | 10.67 / 11.14| +| Cyclist | 0.99 / 1.47 | 0.54 / 0.65 | 0.55 / 0.67 | + +## Citation + +```latex +@inproceedings{liu2020smoke, + title={Smoke: Single-stage monocular 3d object detection via keypoint estimation}, + author={Liu, Zechen and Wu, Zizhang and T{\'o}th, Roland}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops}, + pages={996--997}, + year={2020} +} +``` diff --git a/configs/smoke/metafile.yml b/configs/smoke/metafile.yml new file mode 100644 index 0000000000..df956e4963 --- /dev/null +++ b/configs/smoke/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: SMOKE + Metadata: + Training Data: KITTI + Training Techniques: + - Adam + Training Resources: 4x V100 GPUS + Architecture: + - SMOKEMono3DHead + - DLA + Paper: + URL: https://arxiv.org/abs/2002.10111 + Title: 'SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation' + README: configs/smoke/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/smoke_mono3d.py#L7 + Version: v1.0.0 + +Models: + - Name: smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d + In Collection: SMOKE + Config: configs/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py + Metadata: + Training Memory (GB): 9.6 + Results: + - Task: 3D Object Detection + Dataset: KITTI + Metrics: + mAP: 13.8 + Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553-d46d9bb0.pth diff --git a/configs/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py b/configs/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py new file mode 100644 index 0000000000..c802ce3083 --- /dev/null +++ b/configs/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/datasets/kitti-mono3d.py', '../_base_/models/smoke.py', + '../_base_/default_runtime.py' +] + +# optimizer +optimizer = dict(type='Adam', lr=2.5e-4) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='step', warmup=None, step=[50]) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=72) +log_config = dict(interval=10) + +find_unused_parameters = True +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + 
type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=False, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3), + dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d', + 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + img_scale=(1280, 384), + flip=False, + transforms=[ + dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/ssn/README.md b/configs/ssn/README.md index d47f0d90e9..c1801397ce 100644 --- a/configs/ssn/README.md +++ b/configs/ssn/README.md @@ -1,21 +1,22 @@ # SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds -## Introduction +> [SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds](https://arxiv.org/abs/2004.02774) -We implement PointPillars with Shape-aware grouping heads used in the SSN and provide the results and checkpoints on the nuScenes and Lyft dataset. +## Abstract -``` -@inproceedings{zhu2020ssn, - title={SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds}, - author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua}, - booktitle={Proceedings of the European Conference on Computer Vision}, - year={2020} -} -``` +Multi-class 3D object detection aims to localize and classify objects of multiple categories from point clouds. Due to the nature of point clouds, i.e. unstructured, sparse and noisy, some features benefit-ting multi-class discrimination are underexploited, such as shape information. In this paper, we propose a novel 3D shape signature to explore the shape information from point clouds. By incorporating operations of symmetry, convex hull and chebyshev fitting, the proposed shape sig-nature is not only compact and effective but also robust to the noise, which serves as a soft constraint to improve the feature capability of multi-class discrimination. Based on the proposed shape signature, we develop the shape signature networks (SSN) for 3D object detection, which consist of pyramid feature encoding part, shape-aware grouping heads and explicit shape encoding objective. Experiments show that the proposed method performs remarkably better than existing methods on two large-scale datasets. Furthermore, our shape signature can act as a plug-and-play component and ablation study shows its effectiveness and good scalability. + +
+ +## Introduction + +We implement PointPillars with Shape-aware grouping heads used in the SSN and provide the results and checkpoints on the nuScenes and Lyft dataset. -## Results +## Results and models ### NuScenes @@ -39,3 +40,14 @@ Note: The main difference of the shape-aware grouping heads with the original SECOND FPN heads is that the former groups objects with similar sizes and shapes together, and design shape-specific heads for each group. Heavier heads (with more convolutions and large strides) are designed for large objects while smaller heads for small objects. Note that there may appear different feature map sizes in the outputs, so an anchor generator tailored to these feature maps is also needed in the implementation. Users could try other settings in terms of the head design. Here we basically refer to the implementation [HERE](https://github.com/xinge008/SSN). + +## Citation + +```latex +@inproceedings{zhu2020ssn, + title={SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds}, + author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua}, + booktitle={Proceedings of the European Conference on Computer Vision}, + year={2020} +} +``` diff --git a/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py b/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py index 8a70d2a770..50b33c8019 100644 --- a/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py +++ b/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py @@ -96,15 +96,15 @@ [-100, -100, -0.6276341, 100, 100, -0.6276341], [-100, -100, -0.3033737, 100, 100, -0.3033737]], sizes=[ - [0.63, 1.76, 1.44], # bicycle - [0.96, 2.35, 1.59], # motorcycle - [0.76, 0.80, 1.76], # pedestrian - [0.35, 0.73, 0.50], # animal - [1.92, 4.75, 1.71], # car - [2.42, 6.52, 2.34], # emergency vehicle - [2.92, 12.70, 3.42], # bus - [2.75, 8.17, 3.20], # other vehicle - [2.84, 10.24, 3.44] # truck + [1.76, 0.63, 1.44], # bicycle + [2.35, 0.96, 1.59], # motorcycle + [0.80, 0.76, 1.76], # pedestrian + [0.73, 0.35, 0.50], # animal + [4.75, 1.92, 1.71], # car + [6.52, 2.42, 2.34], # emergency vehicle + [12.70, 2.92, 3.42], # bus + [8.17, 2.75, 3.20], # other vehicle + [10.24, 2.84, 3.44] # truck ], custom_values=[], rotations=[0, 1.57], @@ -137,7 +137,7 @@ ], assign_per_class=True, diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 + dir_offset=-0.7854, # -pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( diff --git a/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py b/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py index 18b658b0c3..8550201416 100644 --- a/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py +++ b/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py @@ -94,16 +94,16 @@ [-50, -50, -1.80673031, 50, 50, -1.80673031], [-50, -50, -1.64824291, 50, 50, -1.64824291]], sizes=[ - [0.60058911, 1.68452161, 1.27192197], # bicycle - [0.76279481, 2.09973778, 1.44403034], # motorcycle - [0.66344886, 0.72564370, 1.75748069], # pedestrian - [0.39694519, 0.40359262, 1.06232151], # traffic cone - [2.49008838, 0.48578221, 0.98297065], # barrier - [1.95017717, 4.60718145, 1.72270761], # car - [2.45609390, 6.73778078, 2.73004906], # truck - [2.87427237, 12.01320693, 3.81509561], # trailer - [2.94046906, 11.1885991, 3.47030982], # bus - [2.73050468, 6.38352896, 3.13312415] # construction vehicle + [1.68452161, 0.60058911, 1.27192197], # bicycle + [2.09973778, 0.76279481, 1.44403034], # motorcycle + [0.72564370, 0.66344886, 1.75748069], # pedestrian 
+ [0.40359262, 0.39694519, 1.06232151], # traffic cone + [0.48578221, 2.49008838, 0.98297065], # barrier + [4.60718145, 1.95017717, 1.72270761], # car + [6.73778078, 2.45609390, 2.73004906], # truck + [12.01320693, 2.87427237, 3.81509561], # trailer + [11.1885991, 2.94046906, 3.47030982], # bus + [6.38352896, 2.73050468, 3.13312415] # construction vehicle ], custom_values=[0, 0], rotations=[0, 1.57], @@ -144,7 +144,7 @@ ], assign_per_class=True, diff_rad_by_sin=True, - dir_offset=0.7854, # pi/4 + dir_offset=-0.7854, # -pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( diff --git a/configs/votenet/README.md b/configs/votenet/README.md index 49d8c7065d..1b847f8545 100644 --- a/configs/votenet/README.md +++ b/configs/votenet/README.md @@ -1,21 +1,22 @@ # Deep Hough Voting for 3D Object Detection in Point Clouds -## Introduction +> [Deep Hough Voting for 3D Object Detection in Point Clouds](https://arxiv.org/abs/1904.09664) -We implement VoteNet and provide the result and checkpoints on ScanNet and SUNRGBD datasets. +## Abstract -``` -@inproceedings{qi2019deep, - author = {Qi, Charles R and Litany, Or and He, Kaiming and Guibas, Leonidas J}, - title = {Deep Hough Voting for 3D Object Detection in Point Clouds}, - booktitle = {Proceedings of the IEEE International Conference on Computer Vision}, - year = {2019} -} -``` +Current 3D object detection methods are heavily influenced by 2D detectors. In order to leverage architectures in 2D detectors, they often convert 3D point clouds to regular grids (i.e., to voxel grids or to bird's eye view images), or rely on detection in 2D images to propose 3D boxes. Few works have attempted to directly detect objects in point clouds. In this work, we return to first principles to construct a 3D detection pipeline for point cloud data and as generic as possible. However, due to the sparse nature of the data -- samples from 2D manifolds in 3D space -- we face a major challenge when directly predicting bounding box parameters from scene points: a 3D object centroid can be far from any surface point thus hard to regress accurately in one step. To address the challenge, we propose VoteNet, an end-to-end 3D object detection network based on a synergy of deep point set networks and Hough voting. Our model achieves state-of-the-art 3D detection on two large datasets of real 3D scans, ScanNet and SUN RGB-D with a simple design, compact model size and high efficiency. Remarkably, VoteNet outperforms previous methods by using purely geometric information without relying on color images. + +
+ +## Introduction + +We implement VoteNet and provide the result and checkpoints on ScanNet and SUNRGBD datasets. -## Results +## Results and models ### ScanNet @@ -54,3 +55,14 @@ iou_loss=dict(type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0 | [PointNet++](./votenet_iouloss_8x8_scannet-3d-18class.py) | 3x |4.1||63.81|44.21|/| For now, we only support calculating IoU loss for axis-aligned bounding boxes since the CUDA op of general 3D IoU calculation does not implement the backward method. Therefore, IoU loss can only be used for ScanNet dataset for now. + +## Citation + +```latex +@inproceedings{qi2019deep, + author = {Qi, Charles R and Litany, Or and He, Kaiming and Guibas, Leonidas J}, + title = {Deep Hough Voting for 3D Object Detection in Point Clouds}, + booktitle = {Proceedings of the IEEE International Conference on Computer Vision}, + year = {2019} +} +``` diff --git a/data/s3dis/collect_indoor3d_data.py b/data/s3dis/collect_indoor3d_data.py index 3dc4a05fd5..29fb959f41 100644 --- a/data/s3dis/collect_indoor3d_data.py +++ b/data/s3dis/collect_indoor3d_data.py @@ -1,7 +1,8 @@ import argparse +from os import path as osp + import mmcv from indoor3d_util import export -from os import path as osp parser = argparse.ArgumentParser() parser.add_argument( diff --git a/data/s3dis/indoor3d_util.py b/data/s3dis/indoor3d_util.py index a500a1d4e0..5ccb0af2c4 100644 --- a/data/s3dis/indoor3d_util.py +++ b/data/s3dis/indoor3d_util.py @@ -1,7 +1,8 @@ import glob -import numpy as np from os import path as osp +import numpy as np + # ----------------------------------------------------------------------------- # CONSTANTS # ----------------------------------------------------------------------------- diff --git a/data/scannet/batch_load_scannet_data.py b/data/scannet/batch_load_scannet_data.py index 60b53b3db4..90a3a44cb8 100644 --- a/data/scannet/batch_load_scannet_data.py +++ b/data/scannet/batch_load_scannet_data.py @@ -11,11 +11,12 @@ """ import argparse import datetime -import numpy as np import os -from load_scannet_data import export from os import path as osp +import numpy as np +from load_scannet_data import export + DONOTCARE_CLASS_IDS = np.array([]) OBJ_CLASS_IDS = np.array( [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]) diff --git a/data/scannet/extract_posed_images.py b/data/scannet/extract_posed_images.py index 7018f32d11..b9a0ecc7fe 100644 --- a/data/scannet/extract_posed_images.py +++ b/data/scannet/extract_posed_images.py @@ -1,13 +1,14 @@ # Modified from https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py # noqa -import imageio -import mmcv -import numpy as np import os import struct import zlib from argparse import ArgumentParser from functools import partial +import imageio +import mmcv +import numpy as np + COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'} COMPRESSION_TYPE_DEPTH = { diff --git a/data/scannet/load_scannet_data.py b/data/scannet/load_scannet_data.py index 911bb4c7fe..548f63a93b 100644 --- a/data/scannet/load_scannet_data.py +++ b/data/scannet/load_scannet_data.py @@ -9,8 +9,9 @@ import argparse import inspect import json -import numpy as np import os + +import numpy as np import scannet_utils currentdir = os.path.dirname( @@ -90,7 +91,7 @@ def export(mesh_file, test_mode (bool): Whether is generating test data without labels. Default: False. 
- It returns a tuple, which containts the the following things: + It returns a tuple, which contains the the following things: np.ndarray: Vertices of points data. np.ndarray: Indexes of label. np.ndarray: Indexes of instance. diff --git a/data/scannet/scannet_utils.py b/data/scannet/scannet_utils.py index 5813098f89..46e160b496 100644 --- a/data/scannet/scannet_utils.py +++ b/data/scannet/scannet_utils.py @@ -8,8 +8,9 @@ """ import csv -import numpy as np import os + +import numpy as np from plyfile import PlyData diff --git a/docker/Dockerfile b/docker/Dockerfile index 48b83ce4d4..9628d7ba38 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,7 +19,7 @@ RUN pip install mmsegmentation==0.18.0 # Install MMDetection3D RUN conda clean --all -RUN git clone https://github.com/open-mmlab/mmdetection3d.git /mmdetection3d +COPY . /mmdetection3d WORKDIR /mmdetection3d ENV FORCE_CUDA="1" RUN pip install -r requirements/build.txt diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile new file mode 100644 index 0000000000..d4c1a3930b --- /dev/null +++ b/docker/serve/Dockerfile @@ -0,0 +1,54 @@ +ARG PYTORCH="1.6.0" +ARG CUDA="10.1" +ARG CUDNN="7" +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ARG MMCV="1.3.8" +ARG MMSEGMENTATION="0.14.1" +ARG MMDET="2.14.0" +ARG MMDET3D="0.17.1" + +ENV PYTHONUNBUFFERED TRUE + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + g++ \ + openjdk-11-jre-headless \ + # MMDet3D Requirements + ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + && rm -rf /var/lib/apt/lists/* + +ENV PATH="/opt/conda/bin:$PATH" +RUN export FORCE_CUDA=1 + +# TORCHSEVER +RUN pip install torchserve torch-model-archiver + +# MMLAB +ARG PYTORCH +ARG CUDA +RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"] +RUN pip install mmdet==${MMDET} +RUN pip install mmsegmentation==${MMSEGMENTATION} +RUN pip install mmdet3d==${MMDET3D} + + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh + +RUN chmod +x /usr/local/bin/entrypoint.sh \ + && chown -R model-server /home/model-server + +COPY config.properties /home/model-server/config.properties +RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store + +EXPOSE 8080 8081 8082 + +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["serve"] diff --git a/docker/serve/config.properties b/docker/serve/config.properties new file mode 100644 index 0000000000..efb9c47e40 --- /dev/null +++ b/docker/serve/config.properties @@ -0,0 +1,5 @@ +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +model_store=/home/model-server/model-store +load_models=all diff --git a/docker/serve/entrypoint.sh b/docker/serve/entrypoint.sh new file mode 100644 index 0000000000..41ba00b048 --- /dev/null +++ b/docker/serve/entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [[ "$1" = "serve" ]]; then + shift 1 + torchserve --start --ts-config /home/model-server/config.properties +else + eval "$@" +fi + +# prevent docker exit +tail -f /dev/null diff --git a/docs/en/1_exist_data_model.md b/docs/en/1_exist_data_model.md index 0c680e6a25..0c8f4571fc 100644 --- a/docs/en/1_exist_data_model.md +++ 
b/docs/en/1_exist_data_model.md @@ -9,6 +9,7 @@ For high-level apis easier to integrated into other projects and basic demos, pl ### Test existing models on standard datasets - single GPU +- CPU - single node multiple GPU - multiple node @@ -18,10 +19,18 @@ You can use the following commands to test a dataset. # single-gpu testing python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}] +# CPU: disable GPUs and run single-gpu testing script (experimental) +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}] + # multi-gpu testing ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] ``` +**Note**: + +For now, CPU testing is only supported for SMOKE. + Optional arguments: - `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. - `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset. Typically we default to use official metrics for evaluation on different datasets, so it can be simply set to `mAP` as a placeholder for detection tasks, which applies to nuScenes, Lyft, ScanNet and SUNRGBD. For KITTI, if we only want to evaluate the 2D detection performance, we can simply set the metric to `img_bbox` (unstable, stay tuned). For Waymo, we provide both KITTI-style evaluation (unstable) and Waymo-style official protocol, corresponding to metric `kitti` and `waymo` respectively. We recommend to use the default official metric for stable performance and fair comparison with other methods. Similarly, the metric can be set to `mIoU` for segmentation tasks, which applies to S3DIS and ScanNet. @@ -145,6 +154,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments] If you want to specify the working directory in the command, you can add an argument `--work-dir ${YOUR_WORK_DIR}`. +### Training with CPU (experimental) + +The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process. + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +And then run the script of train with a single GPU. + +**Note**: + +For now, most of the point cloud related algorithms rely on 3D CUDA op, which can not be trained on CPU. Some monocular 3D object detection algorithms, like FCOS3D and SMOKE can be trained on CPU. We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug certain models on machines without GPU for convenience. + ### Train with multiple GPUs ```shell diff --git a/docs/en/changelog.md b/docs/en/changelog.md index 515cdcd804..54a4601ee5 100644 --- a/docs/en/changelog.md +++ b/docs/en/changelog.md @@ -1,5 +1,85 @@ ## Changelog +### v1.0.0rc0 (18/2/2022) + +#### Compatibility + +- We refactor our three coordinate systems to make their rotation directions and origins more consistent, and further remove unnecessary hacks in different datasets and models. Therefore, please re-generate data infos or convert the old version to the new one with our provided scripts. We will also provide updated checkpoints in the next version. Please refer to the [compatibility documentation](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/docs/en/compatibility.md) for more details. 
+- Unify the camera keys for consistent transformation between coordinate systems on different datasets. The modification changes the key names to `lidar2img`, `depth2img`, `cam2img`, etc., for easier understanding. Customized codes using legacy keys may be influenced. +- The next release will begin to move files of CUDA ops to [MMCV](https://github.com/open-mmlab/mmcv). It will influence the way to import related functions. We will not break the compatibility but will raise a warning first and please prepare to migrate it. + +#### Highlights + +- Support new monocular 3D detectors: [PGD](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pgd), [SMOKE](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/smoke), [MonoFlex](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/monoflex) +- Support a new LiDAR-based detector: [PointRCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/point_rcnn) +- Support a new backbone: [DGCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/dgcnn) +- Support 3D object detection on the S3DIS dataset +- Support compilation on Windows +- Full benchmark for PAConv on S3DIS +- Further enhancement for documentation, especially on the Chinese documentation + +#### New Features + +- Support 3D object detection on the S3DIS dataset (#835) +- Support PointRCNN (#842, #843, #856, #974, #1022, #1109, #1125) +- Support DGCNN (#896) +- Support PGD (#938, #940, #948, #950, #964, #1014, #1065, #1070, #1157) +- Support SMOKE (#939, #955, #959, #975, #988, #999, #1029) +- Support MonoFlex (#1026, #1044, #1114, #1115, #1183) +- Support CPU Training (#1196) + +#### Improvements + +- Support point sampling based on distance metric (#667, #840) +- Refactor coordinate systems (#677, #774, #803, #899, #906, #912, #968, #1001) +- Unify camera keys in PointFusion and transformations between different systems (#791, #805) +- Refine documentation (#792, #827, #829, #836, #849, #854, #859, #1111, #1113, #1116, #1121, #1132, #1135, #1185, #1193, #1226) +- Add a script to support benchmark regression (#808) +- Benchmark PAConvCUDA on S3DIS (#847) +- Support to download pdf and epub documentation (#850) +- Change the `repeat` setting in Group-Free-3D configs to reduce training epochs (#855) +- Support KITTI AP40 evaluation metric (#927) +- Add the mmdet3d2torchserve tool for SECOND (#977) +- Add code-spell pre-commit hook and fix typos (#995) +- Support the latest numba version (#1043) +- Set a default seed to use when the random seed is not specified (#1072) +- Distribute mix-precision models to each algorithm folder (#1074) +- Add abstract and a representative figure for each algorithm (#1086) +- Upgrade pre-commit hook (#1088, #1217) +- Support augmented data and ground truth visualization (#1092) +- Add local yaw property for `CameraInstance3DBoxes` (#1130) +- Lock the required numba version to 0.53.0 (#1159) +- Support the usage of plane information for KITTI dataset (#1162) +- Deprecate the support for "python setup.py test" (#1164) +- Reduce the number of multi-process threads to accelerate training (#1168) +- Support 3D flip augmentation for semantic segmentation (#1181) +- Update README format for each model (#1195) + +#### Bug Fixes + +- Fix compiling errors on Windows (#766) +- Fix the deprecated nms setting in the ImVoteNet config (#828) +- Use the latest `wrap_fp16_model` import from mmcv (#861) +- Remove 2D annotations generation on Lyft (#867) +- Update index files for the Chinese 
documentation to be consistent with the English version (#873) +- Fix the nested list transpose in the CenterPoint head (#879) +- Fix deprecated pretrained model loading for RegNet (#889) +- Fix the incorrect dimension indices of rotations and testing config in the CenterPoint test time augmentation (#892) +- Fix and improve visualization tools (#956, #1066, #1073) +- Fix PointPillars FLOPs calculation error (#1075) +- Fix missing dimension information in the SUN RGB-D data generation (#1120) +- Fix incorrect anchor range settings in the PointPillars [config](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/_base_/models/hv_pointpillars_secfpn_kitti.py) for KITTI (#1163) +- Fix incorrect model information in the RegNet metafile (#1184) +- Fix bugs in non-distributed multi-gpu training and testing (#1197) +- Fix a potential assertion error when generating corners from an empty box (#1212) +- Upgrade bazel version according to the requirement of Waymo Devkit (#1223) + +#### Contributors + +A total of 12 developers contributed to this release. + +@THU17cyz, @wHao-Wu, @wangruohui, @Wuziyi616, @filaPro, @ZwwWayne, @Tai-Wang, @DCNSW, @xieenze, @robin-karlsson0, @ZCMax, @Otteri + ### v0.18.1 (1/2/2022) #### Improvements diff --git a/docs/en/compatibility.md b/docs/en/compatibility.md index 96e347fb46..e44bda7f29 100644 --- a/docs/en/compatibility.md +++ b/docs/en/compatibility.md @@ -1,3 +1,42 @@ +## v1.0.0.dev0 + +### Coordinate system refactoring + +In this version, we did a major code refactoring which improved the consistency among the three coordinate systems (and corresponding box representation), LiDAR, Camera, and Depth. A brief summary for this refactoring is as follows: + +- The three coordinate systems are all right-handed now (which means the yaw angle increases in the counterclockwise direction). +- The LiDAR system `(x_size, y_size, z_size)` corresponds to `(l, w, h)` instead of `(w, l, h)`. This is more natural since `l` is parallel with the direction where the yaw angle is zero, and we prefer using the positive direction of the `x` axis as that direction, which is exactly how we define yaw angle in Depth and Camera coordinate systems. +- The APIs for box-related operations are improved and now are more user-friendly. + +#### ***NOTICE!!*** + +Since definitions of box representation have changed, the annotation data of most datasets require updating: +- SUN RGB-D: Yaw angles in the annotation should be reversed. +- KITTI: For LiDAR boxes in GT databases, (x_size, y_size, z_size, yaw) out of (x, y, z, x_size, y_size, z_size) should be converted from the old LiDAR coordinate system to the new one. The training/validation data annotations should be left unchanged since they are under the Camera coordinate system, which is unmodified after the refactoring. +- Waymo: Same as KITTI. +- nuScenes: For LiDAR boxes in training/validation data and GT databases, (x_size, y_size, z_size, yaw) out of (x, y, z, x_size, y_size, z_size) should be converted. +- Lyft: Same as nuScenes. + +Please regenerate the data annotation/GT database files or use [`update_data_coords.py`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/tools/update_data_coords.py) to update the data. + +To use boxes under Depth and LiDAR coordinate systems, or to convert boxes between different coordinate systems, users should be aware of the difference between the old and new definitions. 
For example, the rotation, flipping, and bev functions of [`DepthInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/depth_box3d.py) and [`LiDARInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mdet3d/core/bbox/structures/lidar_box3d.py) and box conversion [functions](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/box_3d_mode.py) have all been reimplemented in the refactoring. + +Consequently, functions like [`output_to_lyft_box`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/datasets/lyft_dataset.py) undergo small modification to adapt to the new LiDAR/Depth box. + +Since the LiDAR system `(x_size, y_size, z_size)` now corresponds to `(l, w, h)` instead of `(w, l, h)`, the anchor sizes for LiDAR boxes are also changed, e.g., from `[1.6, 3.9, 1.56]` to `[3.9, 1.6, 1.56]`. + +Functions only involving points are generally unaffected except if they rely on some refactored utility functions such as `rotation_3d_in_axis`. + +#### Other BC-breaking or new features: + +- `array_converter`: Please refer to [array_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/utils/array_converter.py). Functions wrapped with `array_converter` can convert array-like input types of `torch.Tensor`, `np.ndarray`, and `list/tuple/float` to `torch.Tensor` to process in an unified PyTorch pipeline. The result may finally be converted back to the input type. Most functions in [utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/utils.py) are wrapped with `array_converter`. +- [`points_in_boxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/base_box3d.py) and [`points_in_boxes_batch`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/base_box3d.py) will be deprecated soon. They are renamed to `points_in_boxes_part` and `points_in_boxes_all` respectively, with more detailed docstrings. The major difference of the two functions is that if a point is enclosed by multiple boxes, `points_in_boxes_part` will only return the index of the first enclosing box while `points_in_boxes_all` will return all the indices of enclosing boxes. +- `rotation_3d_in_axis`: Please refer to [utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/utils.py). Now this function supports multiple input types and more options. The function with the same name in [box_np_ops.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/box_np_ops.py) is deleted since we do not need another function to tackle with NumPy data. `rotation_2d`, `points_cam2img`, and `limit_period` in box_np_ops.py are also deleted for the same reason. +- `bev` method of [`CameraInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/cam_box3d.py): Changed it to be consistent with the definition of bev in Depth and LiDAR coordinate systems. +- Data augmentation utils in [data_augment_utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/datasets/pipelines/data_augment_utils.py) now follow the rules of a right-handed system. +- We do not need the yaw hacking in KITTI anymore after refining [`get_direction_target`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/dense_heads/train_mixins.py). 
Interested users may refer to PR [#677](https://github.com/open-mmlab/mmdetection3d/pull/677) . + + ## 0.16.0 ### Returned values of `QueryAndGroup` operation diff --git a/docs/en/conf.py b/docs/en/conf.py index 52e05fc6ba..8187056939 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -11,9 +11,10 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import os -import pytorch_sphinx_theme import subprocess import sys + +import pytorch_sphinx_theme from m2r import MdInclude from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder diff --git a/docs/en/data_preparation.md b/docs/en/data_preparation.md index 970f594238..159f248cc9 100644 --- a/docs/en/data_preparation.md +++ b/docs/en/data_preparation.md @@ -78,7 +78,7 @@ mmdetection3d ### KITTI -Download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Prepare KITTI data by running +Download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Prepare KITTI data splits by running ```bash mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets @@ -88,10 +88,20 @@ wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/sec wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt +``` + +Then generate info files by running +``` python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti ``` +In an environment using slurm, users may run the following command instead + +``` +sh tools/create_data.sh kitti +``` + ### Waymo Download Waymo open dataset V1.2 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put tfrecord files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running diff --git a/docs/en/datasets/kitti_det.md b/docs/en/datasets/kitti_det.md index 2562da92f2..3398402443 100644 --- a/docs/en/datasets/kitti_det.md +++ b/docs/en/datasets/kitti_det.md @@ -6,7 +6,7 @@ This page provides specific tutorials about the usage of MMDetection3D for KITTI ## Prepare dataset -You can download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip all zip files. +You can download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip all zip files. 
Besides, the road planes could be downloaded from [HERE](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip), which are optional for data augmentation during training for better performance. The road planes are generated by [AVOD](https://github.com/kujason/avod), you can see more details [HERE](https://github.com/kujason/avod/issues/19). Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`. @@ -29,6 +29,7 @@ mmdetection3d │ │ │ ├── image_2 │ │ │ ├── label_2 │ │ │ ├── velodyne +│ │ │ ├── planes (optional) ``` ### Create KITTI dataset diff --git a/docs/en/datasets/scannet_det.md b/docs/en/datasets/scannet_det.md index 3d775552e1..6e35dfd9d4 100644 --- a/docs/en/datasets/scannet_det.md +++ b/docs/en/datasets/scannet_det.md @@ -110,8 +110,8 @@ def export(mesh_file, instance_ids[verts] = object_id if object_id not in object_id_to_label_id: object_id_to_label_id[object_id] = label_ids[verts][0] - # bbox format is [x, y, z, dx, dy, dz, label_id] - # [x, y, z] is gravity center of bbox, [dx, dy, dz] is axis-aligned + # bbox format is [x, y, z, x_size, y_size, z_size, label_id] + # [x, y, z] is gravity center of bbox, [x_size, y_size, z_size] is axis-aligned # [label_id] is semantic label id in 'nyu40id' standard # Note: since 3D bbox is axis-aligned, the yaw is 0. unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs, diff --git a/docs/en/faq.md b/docs/en/faq.md index 112b9609db..0cb93b57b4 100644 --- a/docs/en/faq.md +++ b/docs/en/faq.md @@ -19,12 +19,6 @@ We list some potential troubles encountered by users and developers, along with **NOTE**: We have migrated to use pycocotools in mmdet3d >= 0.13.0. -- If you face the error shown below, and your environment contains numba == 0.48.0 with numpy >= 1.20.0: - - ``TypeError: expected dtype object, got 'numpy.dtype[bool_]'`` - - please downgrade numpy to < 1.20.0 or install numba == 0.48 from source, because in numpy == 1.20.0, `np.dtype` produces subclass due to API change. Please refer to [here](https://github.com/numba/numba/issues/6041) for more details. - - If you face the error shown below when importing pycocotools: ``ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject`` diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md index 4ff2e380fa..a483eaa626 100644 --- a/docs/en/getting_started.md +++ b/docs/en/getting_started.md @@ -13,6 +13,7 @@ The required versions of MMCV, MMDetection and MMSegmentation for different vers | MMDetection3D version | MMDetection version | MMSegmentation version | MMCV version | |:-------------------:|:-------------------:|:-------------------:|:-------------------:| | master | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| +| v1.0.0rc0 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.18.1 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.18.0 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.17.3 | mmdet>=2.14.0, <=3.0.0| mmseg>=0.14.1, <=1.0.0 | mmcv-full>=1.3.8, <=1.4.0| @@ -179,7 +180,7 @@ We provide a [Dockerfile](https://github.com/open-mmlab/mmdetection3d/blob/maste ```shell # build an image with PyTorch 1.6, CUDA 10.1 -docker build -t mmdetection3d docker/ +docker build -t mmdetection3d -f docker/Dockerfile . 
``` Run it with diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md index 9192855959..05bd808989 100644 --- a/docs/en/model_zoo.md +++ b/docs/en/model_zoo.md @@ -60,7 +60,7 @@ Please refer to [ImVoteNet](https://github.com/open-mmlab/mmdetection3d/blob/mas ### FCOS3D -Please refer to [FCOS3D](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/fcos3d) for details. We provide FCOS3D baselines on the nuScenes dataset currently. +Please refer to [FCOS3D](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/fcos3d) for details. We provide FCOS3D baselines on the nuScenes dataset. ### PointNet++ @@ -77,3 +77,27 @@ Please refer to [ImVoxelNet](https://github.com/open-mmlab/mmdetection3d/blob/ma ### PAConv Please refer to [PAConv](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/paconv) for details. We provide PAConv baselines on S3DIS dataset. + +### DGCNN + +Please refer to [DGCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/dgcnn) for details. We provide DGCNN baselines on S3DIS dataset. + +### SMOKE + +Please refer to [SMOKE](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/smoke) for details. We provide SMOKE baselines on KITTI dataset. + +### PGD + +Please refer to [PGD](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pgd) for details. We provide PGD baselines on KITTI and nuScenes dataset. + +### PointRCNN + +Please refer to [PointRCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/point_rcnn) for details. We provide PointRCNN baselines on KITTI dataset. + +### MonoFlex + +Please refer to [MonoFlex](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/monoflex) for details. We provide MonoFlex baselines on KITTI dataset. + +### Mixed Precision (FP16) Training + +Please refer [Mixed Precision (FP16) Training] on PointPillars (https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py) for details. diff --git a/docs/en/stat.py b/docs/en/stat.py index 68a945b3d8..b5f10a8845 100755 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -1,10 +1,11 @@ #!/usr/bin/env python import functools as func import glob -import numpy as np import re from os import path as osp +import numpy as np + url_prefix = 'https://github.com/open-mmlab/mmdetection3d/blob/master/' files = sorted(glob.glob('../configs/*/README.md')) diff --git a/docs/en/tutorials/coord_sys_tutorial.md b/docs/en/tutorials/coord_sys_tutorial.md new file mode 100644 index 0000000000..f256ee0602 --- /dev/null +++ b/docs/en/tutorials/coord_sys_tutorial.md @@ -0,0 +1,242 @@ +# Tutorial 6: Coordinate System + +## Overview + +MMDetection3D uses three different coordinate systems. The existence of different coordinate systems in the society of 3D object detection is necessary, because for various 3D data collection devices, such as LiDAR, depth camera, etc., the coordinate systems are not consistent, and different 3D datasets also follow different data formats. Early works, such as SECOND, VoteNet, convert the raw data to another format, forming conventions that some later works also follow, making the conversion between coordinate systems even more complicated. 
+ +Despite the variety of datasets and equipment, by summarizing the line of works on 3D object detection we can roughly categorize coordinate systems into three: + +- Camera coordinate system -- the coordinate system of most cameras, in which the positive direction of the y-axis points to the ground, the positive direction of the x-axis points to the right, and the positive direction of the z-axis points to the front. + ``` + up z front + | ^ + | / + | / + | / + |/ + left ------ 0 ------> x right + | + | + | + | + v + y down + ``` +- LiDAR coordinate system -- the coordinate system of many LiDARs, in which the negative direction of the z-axis points to the ground, the positive direction of the x-axis points to the front, and the positive direction of the y-axis points to the left. + ``` + z up x front + ^ ^ + | / + | / + | / + |/ + y left <------ 0 ------ right + ``` +- Depth coordinate system -- the coordinate system used by VoteNet, H3DNet, etc., in which the negative direction of the z-axis points to the ground, the positive direction of the x-axis points to the right, and the positive direction of the y-axis points to the front. + ``` + z up y front + ^ ^ + | / + | / + | / + |/ + left ------ 0 ------> x right + ``` + +The definition of coordinate systems in this tutorial is actually **more than just defining the three axes**. For a box in the form of ``$$`(x, y, z, dx, dy, dz, r)`$$``, our coordinate systems also define how to interpret the box dimensions ``$$`(dx, dy, dz)`$$`` and the yaw angle ``$$`r`$$``. + +The illustration of the three coordinate systems is shown below: + +![](https://raw.githubusercontent.com/open-mmlab/mmdetection3d/v1.0.0.dev0/resources/coord_sys_all.png) + +The three figures above are the 3D coordinate systems while the three figures below are the bird's eye view. + +We will stick to the three coordinate systems defined in this tutorial in the future. + +## Definition of the yaw angle + +Please refer to [wikipedia](https://en.wikipedia.org/wiki/Euler_angles#Tait%E2%80%93Bryan_angles) for the standard definition of the yaw angle. In object detection, we choose an axis as the gravity axis, and a reference direction on the plane ``$$`\Pi`$$`` perpendicular to the gravity axis, then the reference direction has a yaw angle of 0, and other directions on ``$$`\Pi`$$`` have non-zero yaw angles depending on its angle with the reference direction. + +Currently, for all supported datasets, annotations do not include pitch angle and roll angle, which means we need only consider the yaw angle when predicting boxes and calculating overlap between boxes. + +In MMDetection3D, all three coordinate systems are right-handed coordinate systems, which means the ascending direction of the yaw angle is counter-clockwise if viewed from the negative direction of the gravity axis (the axis is pointing at one's eyes). + +The figure below shows that, in this right-handed coordinate system, if we set the positive direction of the x-axis as a reference direction, then the positive direction of the y-axis has a yaw angle of ``$$`\frac{\pi}{2}`$$``. + +``` + z up y front (yaw=0.5*pi) + ^ ^ + | / + | / + | / + |/ +left (yaw=pi) ------ 0 ------> x right (yaw=0) +``` + +For a box, the value of its yaw angle equals its direction minus a reference direction. In all three coordinate systems in MMDetection3D, the reference direction is always the positive direction of the x-axis, while the direction of a box is defined to be parallel with the x-axis if its yaw angle is 0. 
The definition of the yaw angle of a box is illustrated in the figure below. + +``` +y front + ^ box direction (yaw=0.5*pi) + /|\ ^ + | /|\ + | ____|____ + | | | | + | | | | +__|____|____|____|______\ x right + | | | | / + | | | | + | |____|____| + | +``` + +## Definition of the box dimensions + +The definition of the box dimensions cannot be disentangled with the definition of the yaw angle. In the previous section, we said that the direction of a box is defined to be parallel with the x-axis if its yaw angle is 0. Then naturally, the dimension of a box which corresponds to the x-axis should be ``$$`dx`$$``. However, this is not always the case in some datasets (we will address that later). + +The following figures show the meaning of the correspondence between the x-axis and ``$$`dx`$$``, and between the y-axis and ``$$`dy`$$``. + +``` +y front + ^ box direction (yaw=0.5*pi) + /|\ ^ + | /|\ + | ____|____ + | | | | + | | | | dx +__|____|____|____|______\ x right + | | | | / + | | | | + | |____|____| + | dy +``` + +Note that the box direction is always parallel with the edge ``$$`dx`$$``. + +``` +y front + ^ _________ + /|\ | | | + | | | | + | | | | dy + | |____|____|____\ box direction (yaw=0) + | | | | / +__|____|____|____|_________\ x right + | | | | / + | |____|____| + | dx + | +``` + +## Relation with raw coordinate systems of supported datasets + +### KITTI + +The raw annotation of KITTI is under camera coordinate system, see [get_label_anno](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/tools/data_converter/kitti_data_utils.py). In MMDetection3D, to train LiDAR-based models on KITTI, the data is first converted from camera coordinate system to LiDAR coordinate system, see [get_ann_info](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/datasets/kitti_dataset.py). For training vision-based models, the data is kept in the camera coordinate system. + +In SECOND, the LiDAR coordinate system for a box is defined as follows (a bird's eye view): + +![](https://raw.githubusercontent.com/traveller59/second.pytorch/master/images/kittibox.png) + + + +For each box, the dimensions are ``$$`(w, l, h)`$$``, and the reference direction for the yaw angle is the positive direction of the y axis. For more details, refer to the [repo](https://github.com/traveller59/second.pytorch#concepts). + +Our LiDAR coordinate system has two changes: + +- The yaw angle is defined to be right-handed instead of left-handed for consistency; +- The box dimensions are ``$$`(l, w, h)`$$`` instead of ``$$`(w, l, h)`$$``, since ``$$`w`$$`` corresponds to ``$$`dy`$$`` and ``$$`l`$$`` corresponds to ``$$`dx`$$`` in KITTI. + +### Waymo + +We use the KITTI-format data of Waymo dataset. Therefore, KITTI and Waymo also share the same coordinate system in our implementation. + +### NuScenes + +NuScenes provides a toolkit for evaluation, in which each box is wrapped into a `Box` instance. The coordinate system of `Box` is different from our LiDAR coordinate system in that the first two elements of the box dimension correspond to ``$$`(dy, dx)`$$``, or ``$$`(w, l)`$$``, respectively, instead of the reverse. For more details, please refer to the NuScenes [tutorial](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/docs/datasets/nuscenes_det.md#notes). 
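
To make the correspondence between ``$$`(dx, dy)`$$`` and ``$$`(l, w)`$$`` concrete, as well as the reversed ordering used by the NuScenes `Box` noted above, the following NumPy sketch computes the BEV corners of a box under the conventions of this tutorial. It is an illustration only, not the MMDetection3D API; the helper name and the example sizes (roughly the car anchor used in the nuScenes configs) are made up for this example.

```python
import numpy as np

def bev_corners(x, y, dx, dy, yaw):
    """BEV corners of a box under the conventions of this tutorial.

    `dx` is the edge parallel to the box direction (the length l), `dy` the
    perpendicular edge (the width w); `yaw` increases counter-clockwise about
    the gravity axis and is zero along the positive x-axis.
    """
    # corner offsets in the box frame: (+-dx/2, +-dy/2)
    local = np.array([[dx / 2, dy / 2], [-dx / 2, dy / 2],
                      [-dx / 2, -dy / 2], [dx / 2, -dy / 2]])
    # counter-clockwise rotation by the yaw angle
    rot = np.array([[np.cos(yaw), -np.sin(yaw)],
                    [np.sin(yaw), np.cos(yaw)]])
    return local @ rot.T + np.array([x, y])

# A car-sized box (dx=l=4.6, dy=w=1.95) facing the positive y-axis (yaw=pi/2)
print(bev_corners(0.0, 0.0, 4.6, 1.95, np.pi / 2))

# A NuScenes `Box` stores (w, l, h); swapping the first two entries recovers
# the (dx, dy, dz) ordering used in this tutorial.
w, l, h = 1.95, 4.6, 1.7
dx, dy, dz = l, w, h
```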
+ +Readers may refer to the [NuScenes development kit](https://github.com/nutonomy/nuscenes-devkit/tree/master/python-sdk/nuscenes/eval/detection) for the definition of a [NuScenes box](https://github.com/nutonomy/nuscenes-devkit/blob/2c6a752319f23910d5f55cc995abc547a9e54142/python-sdk/nuscenes/utils/data_classes.py#L457) and implementation of [NuScenes evaluation](https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/eval/detection/evaluate.py). + +### Lyft + +Lyft shares the same data format with NuScenes as far as coordinate system is involved. + +Please refer to the [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) for more information. + +### ScanNet + +The raw data of ScanNet is not point cloud but mesh. The sampled point cloud data is under our depth coordinate system. For ScanNet detection task, the box annotations are axis-aligned, and the yaw angle is always zero. Therefore the direction of the yaw angle in our depth coordinate system makes no difference regarding ScanNet. + +### SUN RGB-D + +The raw data of SUN RGB-D is not point cloud but RGB-D image. By back projection, we obtain the corresponding point cloud for each image, which is under our Depth coordinate system. However, the annotation is not under our system and thus needs conversion. + +For the conversion from raw annotation to annotation under our Depth coordinate system, please refer to [sunrgbd_data_utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/tools/data_converter/sunrgbd_data_utils.py). + +### S3DIS + +S3DIS shares the same coordinate system as ScanNet in our implementation. However, S3DIS is a segmentation-task-only dataset, and thus no annotation is coordinate system sensitive. + +## Examples + +### Box conversion (between different coordinate systems) + +Take the conversion between our Camera coordinate system and LiDAR coordinate system as an example: + +First, for points and box centers, the coordinates before and after the conversion satisfy the following relationship: + +- ``$$`x_{LiDAR}=z_{camera}`$$`` +- ``$$`y_{LiDAR}=-x_{camera}`$$`` +- ``$$`z_{LiDAR}=-y_{camera}`$$`` + +Then, the box dimensions before and after the conversion satisfy the following relationship: + +- ``$$`dx_{LiDAR}=dx_{camera}`$$`` +- ``$$`dy_{LiDAR}=dz_{camera}`$$`` +- ``$$`dz_{LiDAR}=dy_{camera}`$$`` + +Finally, the yaw angle should also be converted: + +- ``$$`r_{LiDAR}=-\frac{\pi}{2}-r_{camera}`$$`` + +See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/box_3d_mode.py) for more details. + +### Bird's Eye View + +The BEV of a camera coordinate system box is ``$$`(x, z, dx, dz, -r)`$$`` if the 3D box is ``$$`(x, y, z, dx, dy, dz, r)`$$``. The inversion of the sign of the yaw angle is because the positive direction of the gravity axis of the Camera coordinate system points to the ground. + +See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/cam_box3d.py) for more details. + +### Rotation of boxes + +We set the rotation of all kinds of boxes to be counter-clockwise about the gravity axis. Therefore, to rotate a 3D box we first calculate the new box center, and then we add the rotation angle to the yaw angle. + +See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/bbox/structures/cam_box3d.py) for more details. 
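
As a quick numerical check of the Camera-to-LiDAR relations listed in the box conversion example above, the sketch below applies the coordinate, dimension and yaw mappings to a single ``$$`(x, y, z, dx, dy, dz, r)`$$`` array. It only illustrates the formulas; the actual conversion in MMDetection3D lives in `box_3d_mode.py` and, as the FAQ below notes, generally also involves a calibration matrix between the two sensors.

```python
import numpy as np

def cam_box_to_lidar(box):
    """Apply the Camera -> LiDAR relations from this tutorial to one box.

    `box` is (x, y, z, dx, dy, dz, r) in the Camera coordinate system; the
    returned array is the same box expressed in the LiDAR coordinate system.
    A plain NumPy illustration of the formulas, not the MMDetection3D API.
    """
    x, y, z, dx, dy, dz, r = box
    return np.array([
        z,               # x_lidar =  z_camera
        -x,              # y_lidar = -x_camera
        -y,              # z_lidar = -y_camera
        dx,              # dx_lidar = dx_camera
        dz,              # dy_lidar = dz_camera
        dy,              # dz_lidar = dy_camera
        -np.pi / 2 - r,  # r_lidar = -pi/2 - r_camera
    ])

# Example with made-up numbers, just to show the index mapping
print(cam_box_to_lidar(np.array([1.0, 1.5, 10.0, 1.6, 1.56, 3.9, 0.0])))
```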
+ +## Common FAQ + +#### Q1: Are the box related ops universal to all coordinate system types? + +No. For example, the ops under [this folder](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/ops/roiaware_pool3d) are applicable to boxes under Depth or LiDAR coordinate system only. The evaluation functions for KITTI dataset [here](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/core/evaluation/kitti_utils) are only applicable to boxes under Camera coordinate system since the rotation is clockwise if viewed from above. + +For each box related op, we have marked the type of boxes to which we can apply the op. + +#### Q2: In every coordinate system, do the three axes point exactly to the right, the front, and the ground, respectively? + +No. For example, in KITTI, we need a calibration matrix when converting from Camera coordinate system to LiDAR coordinate system. + +#### Q3: How does a phase difference of ``$$`2\pi`$$`` in the yaw angle of a box affect evaluation? + +For IoU calculation, a phase difference of ``$$`2\pi`$$`` in the yaw angle will result in the same box, thus not affecting evaluation. + +For angle prediction evaluation such as the NDS metric in NuScenes and the AOS metric in KITTI, the angle of predicted boxes will be first standardized, so the phase difference of ``$$`2\pi`$$`` will not change the result. + +#### Q4: How does a phase difference of ``$$`\pi`$$`` in the yaw angle of a box affect evaluation? + +For IoU calculation, a phase difference of ``$$`\pi`$$`` in the yaw angle will result in the same box, thus not affecting evaluation. + +However, for angle prediction evaluation, this will result in the exact opposite direction. + +Just think about a car. The yaw angle is the angle between the direction of the car front and the positive direction of the x-axis. If we add ``$$`\pi`$$`` to this angle, the car front will become the car rear. + +For categories such as barrier, the front and the rear have no difference, therefore a phase difference of ``$$`\pi`$$`` will not affect the angle prediction score. diff --git a/docs/en/tutorials/index.rst b/docs/en/tutorials/index.rst index dec6ee8133..5ecd9f6e48 100644 --- a/docs/en/tutorials/index.rst +++ b/docs/en/tutorials/index.rst @@ -6,3 +6,4 @@ data_pipeline.md customize_models.md customize_runtime.md + coord_sys_tutorial.md diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md index 06131f6b60..a1a5d85dd5 100644 --- a/docs/en/useful_tools.md +++ b/docs/en/useful_tools.md @@ -71,7 +71,7 @@ To see the prediction results during evaluation, you can run the following comma python tools/test.py ${CONFIG_FILE} ${CKPT_PATH} --eval 'mAP' --eval-options 'show=True' 'out_dir=${SHOW_DIR}' ``` -After running this command, you will obtain the input data, the output of networks and ground-truth labels visualized on the input (e.g. `***_points.obj`, `***_pred.obj`, `***_gt.obj`, `***_img.png` and `***_pred.png` in multi-modality detection task) in `${SHOW_DIR}`. When `show` is enabled, [Open3D](http://www.open3d.org/) will be used to visualize the results online. You need to set `show=False` while running test in remote server without GUI. +After running this command, you will obtain the input data, the output of networks and ground-truth labels visualized on the input (e.g. `***_points.obj`, `***_pred.obj`, `***_gt.obj`, `***_img.png` and `***_pred.png` in multi-modality detection task) in `${SHOW_DIR}`. 
When `show` is enabled, [Open3D](http://www.open3d.org/) will be used to visualize the results online. If you are running the test on a remote server without GUI, online visualization is not supported; in that case, you can set `show=False` to only save the output results in `${SHOW_DIR}`.
 
 As for offline visualization, you will have two options.
 To visualize the results with `Open3D` backend, you can run the following command
@@ -96,6 +96,12 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py -
 
 **Notice**: Once specifying `--output-dir`, the images of views specified by users will be saved when pressing `_ESC_` in open3d window. If you don't have a monitor, you can remove the `--online` flag to only save the visualization results and browse them offline.
 
+To verify the data consistency and the effect of data augmentation, you can also add the `--aug` flag to visualize the data after data augmentation using the command below:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task det --aug --output-dir ${OUTPUT_DIR} --online
+```
+
 If you also want to show 2D images with 3D bounding boxes projected onto them, you need to find a config that supports multi-modality data loading, and then change the `--task` args to `multi_modality-det`. An example is showed below
 
 ```shell
@@ -122,6 +128,64 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task
 
 &nbsp;
 
+# Model Serving
+
+**Note**: This tool is still experimental. Currently, only SECOND can be served with [`TorchServe`](https://pytorch.org/serve/). We will support more models in the future.
+
+In order to serve an `MMDetection3D` model with [`TorchServe`](https://pytorch.org/serve/), you can follow the steps below:
+
+## 1. Convert the model from MMDetection3D to TorchServe
+
+```shell
+python tools/deployment/mmdet3d2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+**Note**: ${MODEL_STORE} needs to be an absolute path to a folder.
+
+## 2. Build `mmdet3d-serve` docker image
+
+```shell
+docker build -t mmdet3d-serve:latest docker/serve/
+```
+
+## 3. Run `mmdet3d-serve`
+
+Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment).
+
+In order to run it on the GPU, you need to install [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). You can omit the `--gpus` argument in order to run on the CPU.
+
+Example:
+
+```shell
+docker run --rm \
+--cpus 8 \
+--gpus device=0 \
+-p8080:8080 -p8081:8081 -p8082:8082 \
+--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
+mmdet3d-serve:latest
+```
+
+[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md/) about the Inference (8080), Management (8081) and Metrics (8082) APIs.
+
+## 4. Test deployment
+
+You can use `test_torchserver.py` to compare the results of TorchServe and PyTorch.
+ +```shell +python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} +[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}] +``` + +Example: + +```shell +python tools/deployment/test_torchserver.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth second +``` + +  + # Model Complexity You can use `tools/analysis_tools/get_flops.py` in MMDetection3D, a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch), to compute the FLOPs and params of a given model. diff --git a/docs/zh_cn/1_exist_data_model.md b/docs/zh_cn/1_exist_data_model.md index d0a0bd2506..8858fd89a0 100644 --- a/docs/zh_cn/1_exist_data_model.md +++ b/docs/zh_cn/1_exist_data_model.md @@ -18,10 +18,18 @@ # 单块显卡测试 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}] +# CPU:禁用显卡并运行单块 CPU 测试脚本(实验性) +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}] + # 多块显卡测试 ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] ``` +**注意**: + +目前我们只支持 SMOKE 的 CPU 推理测试。 + 可选参数: - `RESULT_FILE`:输出结果(pickle 格式)的文件名,如果未指定,结果不会被保存。 - `EVAL_METRICS`:在结果上评测的项,不同的数据集有不同的合法值。具体来说,我们默认对不同的数据集都使用各自的官方度量方法进行评测,所以对 nuScenes、Lyft、ScanNet 和 SUNRGBD 这些数据集来说在检测任务上可以简单设置为 `mAP`;对 KITTI 数据集来说,如果我们只想评测 2D 检测效果,可以将度量方法设置为 `img_bbox`;对于 Waymo 数据集,我们提供了 KITTI 风格(不稳定)和 Waymo 官方风格这两种评测方法,分别对应 `kitti` 和 `waymo`,我们推荐使用默认的官方度量方法,它的性能稳定而且可以与其它算法公平比较;同样地,对 S3DIS、ScanNet 这些数据集来说,在分割任务上的度量方法可以设置为 `mIoU`。 @@ -143,6 +151,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments] 如果你想在命令中指定工作目录,添加参数 `--work-dir ${YOUR_WORK_DIR}`。 +### 使用 CPU 进行训练 (实验性) + +在 CPU 上训练的过程与单 GPU 训练一致。 我们只需要在训练过程之前禁用显卡。 + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +之后运行单显卡训练脚本即可。 + +**注意**: + +目前,大多数点云相关算法都依赖于 3D CUDA 算子,无法在 CPU 上进行训练。 一些单目 3D 物体检测算法,例如 FCOS3D、SMOKE 可以在 CPU 上进行训练。我们不推荐用户使用 CPU 进行训练,这太过缓慢。我们支持这个功能是为了方便用户在没有显卡的机器上调试某些特定的方法。 + ### 使用多块显卡进行训练 ```shell diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 95532d4fd1..803e05413b 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -11,9 +11,10 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. 
# import os -import pytorch_sphinx_theme import subprocess import sys + +import pytorch_sphinx_theme from m2r import MdInclude from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder diff --git a/docs/zh_cn/datasets/kitti_det.md b/docs/zh_cn/datasets/kitti_det.md index d901518df2..85253bf8e9 100644 --- a/docs/zh_cn/datasets/kitti_det.md +++ b/docs/zh_cn/datasets/kitti_det.md @@ -6,7 +6,7 @@ ## 数据准备 -您可以在[这里](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d)下载 KITTI 3D 检测数据并解压缩所有 zip 文件。 +您可以在[这里](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d)下载 KITTI 3D 检测数据并解压缩所有 zip 文件。此外,您可以在[这里](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip)下载道路平面信息,其在训练过程中作为一个可选项,用来提高模型的性能。道路平面信息由 [AVOD](https://github.com/kujason/avod) 生成,你可以在[这里](https://github.com/kujason/avod/issues/19)查看更多细节。 像准备数据集的一般方法一样,建议将数据集根目录链接到 `$MMDETECTION3D/data`。 @@ -29,6 +29,7 @@ mmdetection3d │ │ │ ├── image_2 │ │ │ ├── label_2 │ │ │ ├── velodyne +│ │ │ ├── planes (optional) ``` ### 创建 KITTI 数据集 diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md index d473aa2a03..19a116e15f 100644 --- a/docs/zh_cn/faq.md +++ b/docs/zh_cn/faq.md @@ -19,12 +19,6 @@ **注意**: 我们已经在 0.13.0 及之后的版本中全面支持 pycocotools。 -- 如果您遇到下面的问题,并且您的环境包含 numba == 0.48.0 和 numpy >= 1.20.0: - - ``TypeError: expected dtype object, got 'numpy.dtype[bool_]'`` - - 请将 numpy 的版本降级至 < 1.20.0,或者从源码安装 numba == 0.48,这是由于 numpy == 1.20.0 改变了 API,使得在调用 `np.dtype` 会产生子类。请参考 [这里](https://github.com/numba/numba/issues/6041) 获取更多细节。 - - 如果您在导入 pycocotools 相关包时遇到下面的问题: ``ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject`` diff --git a/docs/zh_cn/getting_started.md b/docs/zh_cn/getting_started.md index 7de264a6c4..cd0772ccdc 100644 --- a/docs/zh_cn/getting_started.md +++ b/docs/zh_cn/getting_started.md @@ -10,6 +10,7 @@ | MMDetection3D version | MMDetection version | MMSegmentation version | MMCV version | |:-------------------:|:-------------------:|:-------------------:|:-------------------:| | master | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| +| v1.0.0rc0 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.18.1 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.18.0 | mmdet>=2.19.0, <=3.0.0| mmseg>=0.20.0, <=1.0.0 | mmcv-full>=1.3.8, <=1.5.0| | 0.17.3 | mmdet>=2.14.0, <=3.0.0| mmseg>=0.14.1, <=1.0.0 | mmcv-full>=1.3.8, <=1.4.0| diff --git a/docs/zh_cn/model_zoo.md b/docs/zh_cn/model_zoo.md index d897bf9dfd..52d0d906aa 100644 --- a/docs/zh_cn/model_zoo.md +++ b/docs/zh_cn/model_zoo.md @@ -75,3 +75,31 @@ ### ImVoxelNet 请参考 [ImVoxelNet](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/imvoxelnet) 获取更多细节,我们在 KITTI 数据集上给出了相应的结果。 + +### PAConv + +请参考 [PAConv](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/paconv) 获取更多细节,我们在 S3DIS 数据集上给出了相应的结果. + +### DGCNN + +请参考 [DGCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/dgcnn) 获取更多细节,我们在 S3DIS 数据集上给出了相应的结果. + +### SMOKE + +请参考 [SMOKE](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/smoke) 获取更多细节,我们在 KITTI 数据集上给出了相应的结果. + +### PGD + +请参考 [PGD](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pgd) 获取更多细节,我们在 KITTI 和 nuScenes 数据集上给出了相应的结果. 
+
+### PointRCNN
+
+请参考 [PointRCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/point_rcnn) 获取更多细节,我们在 KITTI 数据集上给出了相应的结果.
+
+### MonoFlex
+
+请参考 [MonoFlex](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/monoflex) 获取更多细节,我们在 KITTI 数据集上给出了相应的结果.
+
+### Mixed Precision (FP16) Training
+
+细节请参考在 PointPillars 上使用混合精度 (FP16) 训练的[样例配置](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)。
diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py
index 68a945b3d8..b5f10a8845 100755
--- a/docs/zh_cn/stat.py
+++ b/docs/zh_cn/stat.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python
 import functools as func
 import glob
-import numpy as np
 import re
 from os import path as osp
 
+import numpy as np
+
 url_prefix = 'https://github.com/open-mmlab/mmdetection3d/blob/master/'
 
 files = sorted(glob.glob('../configs/*/README.md'))
diff --git a/docs/zh_cn/tutorials/index.rst b/docs/zh_cn/tutorials/index.rst
index dec6ee8133..5ecd9f6e48 100644
--- a/docs/zh_cn/tutorials/index.rst
+++ b/docs/zh_cn/tutorials/index.rst
@@ -6,3 +6,4 @@
    data_pipeline.md
    customize_models.md
    customize_runtime.md
+   coord_sys_tutorial.md
diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md
index d2cde9af52..39f54da855 100644
--- a/docs/zh_cn/useful_tools.md
+++ b/docs/zh_cn/useful_tools.md
@@ -71,7 +71,7 @@ python tools/test.py ${CONFIG_FILE} ${CKPT_PATH} --show --show-dir ${SHOW_DIR}
 python tools/test.py ${CONFIG_FILE} ${CKPT_PATH} --eval 'mAP' --eval-options 'show=True' 'out_dir=${SHOW_DIR}'
 ```
 
-在运行这个指令后,您将会在 `${SHOW_DIR}` 获得输入数据、可视化在输入上的网络输出和真值标签(例如:在多模态检测任务中的`***_points.obj`,`***_pred.obj`,`***_gt.obj`,`***_img.png` 和 `***_pred.png` )。当 `show` 被激活,[Open3D](http://www.open3d.org/) 将会被用来在线可视化结果。当在没有 GUI 的远程服务器上运行测试的时候,您需要设定 `show=False`。
+在运行这个指令后,您将会在 `${SHOW_DIR}` 获得输入数据、可视化在输入上的网络输出和真值标签(例如:在多模态检测任务中的`***_points.obj`,`***_pred.obj`,`***_gt.obj`,`***_img.png` 和 `***_pred.png` )。当 `show` 被激活,[Open3D](http://www.open3d.org/) 将会被用来在线可视化结果。当您在没有 GUI 的远程服务器上运行测试的时候,无法进行在线可视化,您可以设定 `show=False` 将输出结果保存在 `${SHOW_DIR}`。
 
 至于离线可视化,您将有两个选择。
 利用 `Open3D` 后端可视化结果,您可以运行下面的指令
@@ -97,6 +97,12 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py -
 
 **注意**:一旦指定 `--output-dir` ,当按下 open3d 窗口的 `_ESC_`,用户指定的视图图像将被保存。如果您没有显示器,您可以移除 `--online` 标志,从而仅仅保存可视化结果并且进行离线浏览。
 
+为了验证数据的一致性和数据增强的效果,您还可以使用以下命令添加 `--aug` 标志来可视化数据增强后的数据:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task det --aug --output-dir ${OUTPUT_DIR} --online
+```
+
 如果您还想显示 2D 图像以及投影的 3D 边界框,则需要找到支持多模态数据加载的配置文件,然后将 `--task` 参数更改为 `multi_modality-det`。一个例子如下所示
 
 ```shell
@@ -123,6 +129,64 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task
 
 &nbsp;
 
+# 模型部署
+
+**Note**: 此工具仍然处于试验阶段,目前只有 SECOND 支持用 [`TorchServe`](https://pytorch.org/serve/) 部署,我们将会在未来支持更多的模型。
+
+为了使用 [`TorchServe`](https://pytorch.org/serve/) 部署 `MMDetection3D` 模型,您可以遵循以下步骤:
+
+## 1. 将模型从 MMDetection3D 转换到 TorchServe
+
+```shell
+python tools/deployment/mmdet3d2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+**Note**: ${MODEL_STORE} 需要为文件夹的绝对路径。
+
+## 2. 构建 `mmdet3d-serve` 镜像
+
+```shell
+docker build -t mmdet3d-serve:latest docker/serve/
+```
+
+## 3.
运行 `mmdet3d-serve` + +查看官网文档来 [使用 docker 运行 TorchServe](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment)。 + +为了在 GPU 上运行,您需要安装 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)。您可以忽略 `--gpus` 参数,从而在 CPU 上运行。 + +例子: + +```shell +docker run --rm \ +--cpus 8 \ +--gpus device=0 \ +-p8080:8080 -p8081:8081 -p8082:8082 \ +--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \ +mmdet3d-serve:latest +``` + +[阅读文档](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md/) 关于 Inference (8080), Management (8081) and Metrics (8082) 接口。 + +## 4. 测试部署 + +您可以使用 `test_torchserver.py` 进行部署, 同时比较 torchserver 和 pytorch 的结果。 + +```shell +python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} +[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}] +``` + +例子: + +```shell +python tools/deployment/test_torchserver.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth second +``` + +  + # 模型复杂度 您可以使用 MMDetection 中的 `tools/analysis_tools/get_flops.py` 这个脚本文件,基于 [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) 计算一个给定模型的计算量 (FLOPS) 和参数量 (params)。 diff --git a/mmdet3d/apis/__init__.py b/mmdet3d/apis/__init__.py index 1063648ab1..5befc10d51 100644 --- a/mmdet3d/apis/__init__.py +++ b/mmdet3d/apis/__init__.py @@ -4,10 +4,11 @@ inference_multi_modality_detector, inference_segmentor, init_model, show_result_meshlab) from .test import single_gpu_test -from .train import train_model +from .train import init_random_seed, train_model __all__ = [ 'inference_detector', 'init_model', 'single_gpu_test', 'inference_mono_3d_detector', 'show_result_meshlab', 'convert_SyncBN', - 'train_model', 'inference_multi_modality_detector', 'inference_segmentor' + 'train_model', 'inference_multi_modality_detector', 'inference_segmentor', + 'init_random_seed' ] diff --git a/mmdet3d/apis/inference.py b/mmdet3d/apis/inference.py index 8779392be1..a83bd13d89 100644 --- a/mmdet3d/apis/inference.py +++ b/mmdet3d/apis/inference.py @@ -1,14 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import re +from copy import deepcopy +from os import path as osp + import mmcv import numpy as np -import re import torch -from copy import deepcopy from mmcv.parallel import collate, scatter from mmcv.runner import load_checkpoint -from os import path as osp -from mmdet3d.core import (Box3DMode, CameraInstance3DBoxes, +from mmdet3d.core import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes, show_multi_modality_result, show_result, show_seg_result) @@ -83,26 +84,53 @@ def inference_detector(model, pcd): """ cfg = model.cfg device = next(model.parameters()).device # model device + + if not isinstance(pcd, str): + cfg = cfg.copy() + # set loading pipeline type + cfg.data.test.pipeline[0].type = 'LoadPointsFromDict' + # build the data pipeline test_pipeline = deepcopy(cfg.data.test.pipeline) test_pipeline = Compose(test_pipeline) box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) - data = dict( - pts_filename=pcd, - box_type_3d=box_type_3d, - box_mode_3d=box_mode_3d, - # for ScanNet demo we need axis_align_matrix - ann_info=dict(axis_align_matrix=np.eye(4)), - sweeps=[], - # set timestamp = 0 - timestamp=[0], - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) + + if isinstance(pcd, str): + # load from point clouds file + data = dict( + pts_filename=pcd, + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + # for ScanNet demo we need axis_align_matrix + ann_info=dict(axis_align_matrix=np.eye(4)), + sweeps=[], + # set timestamp = 0 + timestamp=[0], + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + else: + # load from http + data = dict( + points=pcd, + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + # for ScanNet demo we need axis_align_matrix + ann_info=dict(axis_align_matrix=np.eye(4)), + sweeps=[], + # set timestamp = 0 + timestamp=[0], + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: @@ -317,8 +345,7 @@ def show_det_result_meshlab(data, # for now we convert points into depth mode box_mode = data['img_metas'][0][0]['box_mode_3d'] if box_mode != Box3DMode.DEPTH: - points = points[..., [1, 0, 2]] - points[..., 0] *= -1 + points = Coord3DMode.convert(points, box_mode, Coord3DMode.DEPTH) show_bboxes = Box3DMode.convert(pred_bboxes, box_mode, Box3DMode.DEPTH) else: show_bboxes = deepcopy(pred_bboxes) @@ -462,15 +489,17 @@ def show_result_meshlab(data, data (dict): Contain data from pipeline. result (dict): Predicted result from model. out_dir (str): Directory to save visualized result. - score_thr (float): Minimum score of bboxes to be shown. Default: 0.0 - show (bool): Visualize the results online. Defaults to False. - snapshot (bool): Whether to save the online results. Defaults to False. - task (str): Distinguish which task result to visualize. Currently we - support 3D detection, multi-modality detection and 3D segmentation. - Defaults to 'det'. - palette (list[list[int]]] | np.ndarray | None): The palette of - segmentation map. If None is given, random palette will be - generated. Defaults to None. + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.0 + show (bool, optional): Visualize the results online. Defaults to False. 
+ snapshot (bool, optional): Whether to save the online results. + Defaults to False. + task (str, optional): Distinguish which task result to visualize. + Currently we support 3D detection, multi-modality detection and + 3D segmentation. Defaults to 'det'. + palette (list[list[int]]] | np.ndarray, optional): The palette + of segmentation map. If None is given, random palette will be + generated. Defaults to None. """ assert task in ['det', 'multi_modality-det', 'seg', 'mono-det'], \ f'unsupported visualization task {task}' diff --git a/mmdet3d/apis/test.py b/mmdet3d/apis/test.py index 3b9d932fe5..c0e66c07f8 100644 --- a/mmdet3d/apis/test.py +++ b/mmdet3d/apis/test.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + import mmcv import torch from mmcv.image import tensor2imgs -from os import path as osp from mmdet3d.models import (Base3DDetector, Base3DSegmentor, SingleStageMono3DDetector) @@ -22,9 +23,9 @@ def single_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. - show (bool): Whether to save viualization results. + show (bool, optional): Whether to save viualization results. Default: True. - out_dir (str): The path to save visualization results. + out_dir (str, optional): The path to save visualization results. Default: None. Returns: diff --git a/mmdet3d/apis/train.py b/mmdet3d/apis/train.py index 93659468e4..79b48f9327 100644 --- a/mmdet3d/apis/train.py +++ b/mmdet3d/apis/train.py @@ -1,8 +1,44 @@ # Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch import distributed as dist + from mmdet.apis import train_detector from mmseg.apis import train_segmentor +def init_random_seed(seed=None, device='cuda'): + """Initialize random seed. + + If the seed is not set, the seed will be automatically randomized, + and then broadcast to all processes to prevent some potential bugs. + Args: + seed (int, optional): The seed. Default to None. + device (str, optional): The device where the seed will be put on. + Default to 'cuda'. + Returns: + int: Seed to be used. + """ + if seed is not None: + return seed + + # Make sure all ranks share the same random seed to prevent + # some potential bugs. Please refer to + # https://github.com/open-mmlab/mmdetection/issues/6339 + rank, world_size = get_dist_info() + seed = np.random.randint(2**31) + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() + + def train_model(model, dataset, cfg, diff --git a/mmdet3d/core/anchor/anchor_3d_generator.py b/mmdet3d/core/anchor/anchor_3d_generator.py index ab085f9982..e8681b71da 100644 --- a/mmdet3d/core/anchor/anchor_3d_generator.py +++ b/mmdet3d/core/anchor/anchor_3d_generator.py @@ -19,20 +19,26 @@ class Anchor3DRangeGenerator(object): ranges (list[list[float]]): Ranges of different anchors. The ranges are the same across different feature levels. But may vary for different anchor sizes if size_per_range is True. - sizes (list[list[float]]): 3D sizes of anchors. - scales (list[int]): Scales of anchors in different feature levels. - rotations (list[float]): Rotations of anchors in a feature grid. - custom_values (tuple[float]): Customized values of that anchor. For - example, in nuScenes the anchors have velocities. 
- reshape_out (bool): Whether to reshape the output into (N x 4). - size_per_range: Whether to use separate ranges for different sizes. - If size_per_range is True, the ranges should have the same length - as the sizes, if not, it will be duplicated. + sizes (list[list[float]], optional): 3D sizes of anchors. + Defaults to [[3.9, 1.6, 1.56]]. + scales (list[int], optional): Scales of anchors in different feature + levels. Defaults to [1]. + rotations (list[float], optional): Rotations of anchors in a feature + grid. Defaults to [0, 1.5707963]. + custom_values (tuple[float], optional): Customized values of that + anchor. For example, in nuScenes the anchors have velocities. + Defaults to (). + reshape_out (bool, optional): Whether to reshape the output into + (N x 4). Defaults to True. + size_per_range (bool, optional): Whether to use separate ranges for + different sizes. If size_per_range is True, the ranges should have + the same length as the sizes, if not, it will be duplicated. + Defaults to True. """ def __init__(self, ranges, - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], scales=[1], rotations=[0, 1.5707963], custom_values=(), @@ -86,13 +92,14 @@ def grid_anchors(self, featmap_sizes, device='cuda'): Args: featmap_sizes (list[tuple]): List of feature map sizes in multiple feature levels. - device (str): Device where the anchors will be put on. + device (str, optional): Device where the anchors will be put on. + Defaults to 'cuda'. Returns: - list[torch.Tensor]: Anchors in multiple feature levels. \ - The sizes of each tensor should be [N, 4], where \ - N = width * height * num_base_anchors, width and height \ - are the sizes of the corresponding feature lavel, \ + list[torch.Tensor]: Anchors in multiple feature levels. + The sizes of each tensor should be [N, 4], where + N = width * height * num_base_anchors, width and height + are the sizes of the corresponding feature level, num_base_anchors is the number of anchors for that level. """ assert self.num_levels == len(featmap_sizes) @@ -149,7 +156,7 @@ def anchors_single_range(self, feature_size, anchor_range, scale=1, - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.5707963], device='cuda'): """Generate anchors in a single range. @@ -161,14 +168,18 @@ def anchors_single_range(self, shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). scale (float | int, optional): The scale factor of anchors. - sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with - shape [N, 3], in order of x, y, z. - rotations (list[float] | np.ndarray | torch.Tensor): Rotations of - anchors in a single feature grid. + Defaults to 1. + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to [[3.9, 1.6, 1.56]]. + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to [0, 1.5707963]. device (str): Devices that the anchors will be put on. + Defaults to 'cuda'. Returns: - torch.Tensor: Anchors with shape \ + torch.Tensor: Anchors with shape [*feature_size, num_sizes, num_rots, 7]. """ if len(feature_size) == 2: @@ -231,10 +242,10 @@ class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator): up corner to distribute anchors. Args: - anchor_corner (bool): Whether to align with the corner of the voxel - grid. 
By default it is False and the anchor's center will be + anchor_corner (bool, optional): Whether to align with the corner of the + voxel grid. By default it is False and the anchor's center will be the same as the corresponding voxel's center, which is also the - center of the corresponding greature grid. + center of the corresponding greature grid. Defaults to False. """ def __init__(self, align_corner=False, **kwargs): @@ -245,7 +256,7 @@ def anchors_single_range(self, feature_size, anchor_range, scale, - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.5707963], device='cuda'): """Generate anchors in a single range. @@ -256,15 +267,18 @@ def anchors_single_range(self, anchor_range (torch.Tensor | list[float]): Range of anchors with shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). - scale (float | int, optional): The scale factor of anchors. - sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with - shape [N, 3], in order of x, y, z. - rotations (list[float] | np.ndarray | torch.Tensor): Rotations of - anchors in a single feature grid. - device (str): Devices that the anchors will be put on. + scale (float | int): The scale factor of anchors. + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to [[3.9, 1.6, 1.56]]. + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to [0, 1.5707963]. + device (str, optional): Devices that the anchors will be put on. + Defaults to 'cuda'. Returns: - torch.Tensor: Anchors with shape \ + torch.Tensor: Anchors with shape [*feature_size, num_sizes, num_rots, 7]. """ if len(feature_size) == 2: @@ -334,7 +348,7 @@ class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator): Note that feature maps of different classes may be different. Args: - kwargs (dict): Arguments are the same as those in \ + kwargs (dict): Arguments are the same as those in :class:`AlignedAnchor3DRangeGenerator`. """ @@ -347,15 +361,16 @@ def grid_anchors(self, featmap_sizes, device='cuda'): """Generate grid anchors in multiple feature levels. Args: - featmap_sizes (list[tuple]): List of feature map sizes for \ + featmap_sizes (list[tuple]): List of feature map sizes for different classes in a single feature level. - device (str): Device where the anchors will be put on. + device (str, optional): Device where the anchors will be put on. + Defaults to 'cuda'. Returns: - list[list[torch.Tensor]]: Anchors in multiple feature levels. \ - Note that in this anchor generator, we currently only \ - support single feature level. The sizes of each tensor \ - should be [num_sizes/ranges*num_rots*featmap_size, \ + list[list[torch.Tensor]]: Anchors in multiple feature levels. + Note that in this anchor generator, we currently only + support single feature level. The sizes of each tensor + should be [num_sizes/ranges*num_rots*featmap_size, box_code_size]. """ multi_level_anchors = [] @@ -371,7 +386,7 @@ def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'): This function is usually called by method ``self.grid_anchors``. Args: - featmap_sizes (list[tuple]): List of feature map sizes for \ + featmap_sizes (list[tuple]): List of feature map sizes for different classes in a single feature level. scale (float): Scale factor of the anchors in the current level. device (str, optional): Device the tensor will be put on. 
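
(Editor's note, not part of the patch: the docstring changes above fully document the constructor arguments and `grid_anchors` call of `Anchor3DRangeGenerator`, so a short usage sketch may help. The import path, the KITTI-style range values, the CPU device and the printed shape comment are assumptions for illustration only.)

```python
# Hedged usage sketch of the anchor generator documented above; the import path
# and the concrete numbers are assumptions, not taken from this patch.
from mmdet3d.core.anchor import Anchor3DRangeGenerator

generator = Anchor3DRangeGenerator(
    # one range shared by the single anchor size below
    ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
    sizes=[[3.9, 1.6, 1.56]],       # (dx, dy, dz) after this refactor
    rotations=[0, 1.5707963])

# a single feature level of spatial size (H, W) = (248, 216)
anchors = generator.grid_anchors([(248, 216)], device='cpu')
print(anchors[0].shape)  # roughly (248 * 216 * 2, 7): x, y, z, dx, dy, dz, yaw
```
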
diff --git a/mmdet3d/core/bbox/__init__.py b/mmdet3d/core/bbox/__init__.py index dd06b31c2f..8c66630682 100644 --- a/mmdet3d/core/bbox/__init__.py +++ b/mmdet3d/core/bbox/__init__.py @@ -12,7 +12,8 @@ from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes, get_box_type, limit_period, - mono_cam_box2vis, points_cam2img, xywhr2xyxyr) + mono_cam_box2vis, points_cam2img, points_img2cam, + xywhr2xyxyr) from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back __all__ = [ @@ -25,5 +26,5 @@ 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result', 'DepthInstance3DBoxes', 'BaseInstance3DBoxes', 'bbox3d_mapping_back', 'xywhr2xyxyr', 'limit_period', 'points_cam2img', - 'get_box_type', 'Coord3DMode', 'mono_cam_box2vis' + 'points_img2cam', 'get_box_type', 'Coord3DMode', 'mono_cam_box2vis' ] diff --git a/mmdet3d/core/bbox/box_np_ops.py b/mmdet3d/core/bbox/box_np_ops.py index 6740e4ee4d..bb52bbbfcc 100644 --- a/mmdet3d/core/bbox/box_np_ops.py +++ b/mmdet3d/core/bbox/box_np_ops.py @@ -1,14 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. # TODO: clean the functions in this file and move the APIs into box structures # in the future +# NOTICE: All functions in this file are valid for LiDAR or depth boxes only +# if we use default parameters. import numba import numpy as np +from .structures.utils import limit_period, points_cam2img, rotation_3d_in_axis + def camera_to_lidar(points, r_rect, velo2cam): """Convert points in camera coordinate to lidar coordinate. + Note: + This function is for KITTI only. + Args: points (np.ndarray, shape=[N, 3]): Points in camera coordinate. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in @@ -27,7 +34,10 @@ def camera_to_lidar(points, r_rect, velo2cam): def box_camera_to_lidar(data, r_rect, velo2cam): - """Covert boxes in camera coordinate to lidar coordinate. + """Convert boxes in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. Args: data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. @@ -40,10 +50,13 @@ def box_camera_to_lidar(data, r_rect, velo2cam): np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. """ xyz = data[:, 0:3] - l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] + x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] r = data[:, 6:7] xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) - return np.concatenate([xyz_lidar, w, l, h, r], axis=1) + # yaw and dims also needs to be converted + r_new = -r - np.pi / 2 + r_new = limit_period(r_new, period=np.pi * 2) + return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) def corners_nd(dims, origin=0.5): @@ -80,26 +93,9 @@ def corners_nd(dims, origin=0.5): return corners -def rotation_2d(points, angles): - """Rotation 2d points based on origin point clockwise when angle positive. - - Args: - points (np.ndarray): Points to be rotated with shape \ - (N, point_size, 2). - angles (np.ndarray): Rotation angle with shape (N). - - Returns: - np.ndarray: Same shape as points. - """ - rot_sin = np.sin(angles) - rot_cos = np.cos(angles) - rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) - return np.einsum('aij,jka->aik', points, rot_mat_T) - - def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): """Convert kitti locations, dimensions and angles to corners. 
- format: center(xy), dims(xy), angles(clockwise when positive) + format: center(xy), dims(xy), angles(counterclockwise when positive) Args: centers (np.ndarray): Locations in kitti label file with shape (N, 2). @@ -118,7 +114,7 @@ def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): corners = corners_nd(dims, origin=origin) # corners: [N, 4, 2] if angles is not None: - corners = rotation_2d(corners, angles) + corners = rotation_3d_in_axis(corners, angles) corners += centers.reshape([-1, 1, 2]) return corners @@ -172,37 +168,6 @@ def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): return lidar_points -def rotation_3d_in_axis(points, angles, axis=0): - """Rotate points in specific axis. - - Args: - points (np.ndarray, shape=[N, point_size, 3]]): - angles (np.ndarray, shape=[N]]): - axis (int, optional): Axis to rotate at. Defaults to 0. - - Returns: - np.ndarray: Rotated points. - """ - # points: [N, point_size, 3] - rot_sin = np.sin(angles) - rot_cos = np.cos(angles) - ones = np.ones_like(rot_cos) - zeros = np.zeros_like(rot_cos) - if axis == 1: - rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros], - [rot_sin, zeros, rot_cos]]) - elif axis == 2 or axis == -1: - rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros], - [rot_sin, rot_cos, zeros], [zeros, zeros, ones]]) - elif axis == 0: - rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin], - [zeros, rot_sin, rot_cos], [ones, zeros, zeros]]) - else: - raise ValueError('axis should in range') - - return np.einsum('aij,jka->aik', points, rot_mat_T) - - def center_to_corner_box3d(centers, dims, angles=None, @@ -225,7 +190,7 @@ def center_to_corner_box3d(centers, np.ndarray: Corners with the shape of (N, 8, 3). """ # 'length' in kitti format is in x axis. - # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. corners = corners_nd(dims, origin=origin) # corners: [N, 8, 3] @@ -259,8 +224,8 @@ def box2d_to_corner_jit(boxes): rot_sin = np.sin(boxes[i, -1]) rot_cos = np.cos(boxes[i, -1]) rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] return box_corners @@ -327,15 +292,15 @@ def rotation_points_single_angle(points, angle, axis=0): rot_cos = np.cos(angle) if axis == 1: rot_mat_T = np.array( - [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], + [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], dtype=points.dtype) elif axis == 2 or axis == -1: rot_mat_T = np.array( - [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], + [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], dtype=points.dtype) elif axis == 0: rot_mat_T = np.array( - [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], + [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], dtype=points.dtype) else: raise ValueError('axis should in range') @@ -343,44 +308,6 @@ def rotation_points_single_angle(points, angle, axis=0): return points @ rot_mat_T, rot_mat_T -def points_cam2img(points_3d, proj_mat, with_depth=False): - """Project points in camera coordinates to image coordinates. - - Args: - points_3d (np.ndarray): Points in shape (N, 3) - proj_mat (np.ndarray): Transformation matrix between coordinates. - with_depth (bool, optional): Whether to keep depth in the output. - Defaults to False. 
- - Returns: - np.ndarray: Points in image coordinates with shape [N, 2]. - """ - points_shape = list(points_3d.shape) - points_shape[-1] = 1 - - assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ - f' matrix should be 2 instead of {len(proj_mat.shape)}.' - d1, d2 = proj_mat.shape[:2] - assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( - d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ - f' ({d1}*{d2}) is not supported.' - if d1 == 3: - proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype) - proj_mat_expanded[:d1, :d2] = proj_mat - proj_mat = proj_mat_expanded - - points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1) - point_2d = points_4 @ proj_mat.T - point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] - - if with_depth: - points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]], - axis=-1) - return points_2d_depth - - return point_2d_res - - def box3d_to_bbox(box3d, P2): """Convert box3d in camera coordinates to bbox in image coordinates. @@ -424,7 +351,10 @@ def corner_to_surfaces_3d(corners): def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): - """Check points in rotated bbox and return indicces. + """Check points in rotated bbox and return indices. + + Note: + This function is for counterclockwise boxes. Args: points (np.ndarray, shape=[N, 3+dim]): Points to query. @@ -461,25 +391,9 @@ def minmax_to_corner_2d(minmax_box): return center_to_corner_box2d(center, dims, origin=0.0) -def limit_period(val, offset=0.5, period=np.pi): - """Limit the value into a period for periodic function. - - Args: - val (np.ndarray): The value to be converted. - offset (float, optional): Offset to set the value range. \ - Defaults to 0.5. - period (float, optional): Period of the value. Defaults to np.pi. - - Returns: - torch.Tensor: Value in the range of \ - [-offset * period, (1-offset) * period] - """ - return val - np.floor(val / period + offset) * period - - def create_anchors_3d_range(feature_size, anchor_range, - sizes=((1.6, 3.9, 1.56), ), + sizes=((3.9, 1.6, 1.56), ), rotations=(0, np.pi / 2), dtype=np.float32): """Create anchors 3d by range. @@ -492,14 +406,14 @@ def create_anchors_3d_range(feature_size, (x_min, y_min, z_min, x_max, y_max, z_max). sizes (list[list] | np.ndarray | torch.Tensor, optional): Anchor size with shape [N, 3], in order of x, y, z. - Defaults to ((1.6, 3.9, 1.56), ). + Defaults to ((3.9, 1.6, 1.56), ). rotations (list[float] | np.ndarray | torch.Tensor, optional): Rotations of anchors in a single feature grid. Defaults to (0, np.pi / 2). - dtype (type, optional): Data type. Default to np.float32. + dtype (type, optional): Data type. Defaults to np.float32. Returns: - np.ndarray: Range based anchors with shape of \ + np.ndarray: Range based anchors with shape of (*feature_size, num_sizes, num_rots, 7). """ anchor_range = np.array(anchor_range, dtype) @@ -550,11 +464,11 @@ def rbbox2d_to_near_bbox(rbboxes): """convert rotated bbox to nearest 'standing' or 'lying' bbox. Args: - rbboxes (np.ndarray): Rotated bboxes with shape of \ + rbboxes (np.ndarray): Rotated bboxes with shape of (N, 5(x, y, xdim, ydim, rad)). Returns: - np.ndarray: Bounding boxes with the shpae of + np.ndarray: Bounding boxes with the shape of (N, 4(xmin, ymin, xmax, ymax)). """ rots = rbboxes[..., -1] @@ -570,6 +484,9 @@ def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): """Calculate box iou. Note that jit version runs ~10x faster than the box_overlaps function in mmdet3d.core.evaluation. 
+ Note: + This function is for counterclockwise boxes. + Args: boxes (np.ndarray): Input bounding boxes with shape of (N, 4). query_boxes (np.ndarray): Query boxes with shape of (K, 4). @@ -607,7 +524,10 @@ def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): def projection_matrix_to_CRT_kitti(proj): - """Split projection matrix of kitti. + """Split projection matrix of KITTI. + + Note: + This function is for KITTI only. P = C @ [R|T] C is upper triangular matrix, so we need to inverse CR and use QR @@ -633,6 +553,9 @@ def projection_matrix_to_CRT_kitti(proj): def remove_outside_points(points, rect, Trv2c, P2, image_shape): """Remove points which are outside of image. + Note: + This function is for KITTI only. + Args: points (np.ndarray, shape=[N, 3+dims]): Total points. rect (np.ndarray, shape=[4, 4]): Matrix to project points in @@ -782,8 +705,8 @@ def points_in_convex_polygon_3d_jit(points, normal_vec, d, num_surfaces) -@numba.jit -def points_in_convex_polygon_jit(points, polygon, clockwise=True): +@numba.njit +def points_in_convex_polygon_jit(points, polygon, clockwise=False): """Check points is in 2d convex polygons. True when point in polygon. Args: @@ -800,14 +723,16 @@ def points_in_convex_polygon_jit(points, polygon, clockwise=True): num_points_of_polygon = polygon.shape[1] num_points = points.shape[0] num_polygons = polygon.shape[0] - # if clockwise: - # vec1 = polygon - polygon[:, [num_points_of_polygon - 1] + - # list(range(num_points_of_polygon - 1)), :] - # else: - # vec1 = polygon[:, [num_points_of_polygon - 1] + - # list(range(num_points_of_polygon - 1)), :] - polygon - # vec1: [num_polygon, num_points_of_polygon, 2] - vec1 = np.zeros((2), dtype=polygon.dtype) + # vec for all the polygons + if clockwise: + vec1 = polygon - polygon[:, + np.array([num_points_of_polygon - 1] + list( + range(num_points_of_polygon - 1))), :] + else: + vec1 = polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] - polygon ret = np.zeros((num_points, num_polygons), dtype=np.bool_) success = True cross = 0.0 @@ -815,12 +740,9 @@ def points_in_convex_polygon_jit(points, polygon, clockwise=True): for j in range(num_polygons): success = True for k in range(num_points_of_polygon): - if clockwise: - vec1 = polygon[j, k] - polygon[j, k - 1] - else: - vec1 = polygon[j, k - 1] - polygon[j, k] - cross = vec1[1] * (polygon[j, k, 0] - points[i, 0]) - cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1]) + vec = vec1[j, k] + cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) if cross >= 0: success = False break @@ -839,10 +761,13 @@ def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): |/ |/ 2 -------- 1 + Note: + This function is for LiDAR boxes only. + Args: boxes3d (np.ndarray): Boxes with shape of (N, 7) - [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry - in KITTI dataset. + [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, + see the definition of ry in KITTI dataset. bottom_center (bool, optional): Whether z is on the bottom center of object. Defaults to True. @@ -850,19 +775,25 @@ def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): np.ndarray: Box corners with the shape of [N, 8, 3]. 
""" boxes_num = boxes3d.shape[0] - w, l, h = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] - x_corners = np.array( - [w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2., w / 2.], - dtype=np.float32).T - y_corners = np.array( - [-l / 2., -l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2.], - dtype=np.float32).T + x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array([ + x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., + -x_size / 2., -x_size / 2., x_size / 2. + ], + dtype=np.float32).T + y_corners = np.array([ + -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., + -y_size / 2., y_size / 2., y_size / 2. + ], + dtype=np.float32).T if bottom_center: z_corners = np.zeros((boxes_num, 8), dtype=np.float32) - z_corners[:, 4:8] = h.reshape(boxes_num, 1).repeat(4, axis=1) # (N, 8) + z_corners[:, 4:8] = z_size.reshape(boxes_num, 1).repeat( + 4, axis=1) # (N, 8) else: z_corners = np.array([ - -h / 2., -h / 2., -h / 2., -h / 2., h / 2., h / 2., h / 2., h / 2. + -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., + z_size / 2., z_size / 2., z_size / 2., z_size / 2. ], dtype=np.float32).T @@ -870,9 +801,9 @@ def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): zeros, ones = np.zeros( ry.size, dtype=np.float32), np.ones( ry.size, dtype=np.float32) - rot_list = np.array([[np.cos(ry), -np.sin(ry), zeros], - [np.sin(ry), np.cos(ry), zeros], [zeros, zeros, - ones]]) # (3, 3, N) + rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], + [-np.sin(ry), np.cos(ry), zeros], + [zeros, zeros, ones]]) # (3, 3, N) R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) temp_corners = np.concatenate((x_corners.reshape( diff --git a/mmdet3d/core/bbox/coders/__init__.py b/mmdet3d/core/bbox/coders/__init__.py index 0a6a6c7591..b306525c0c 100644 --- a/mmdet3d/core/bbox/coders/__init__.py +++ b/mmdet3d/core/bbox/coders/__init__.py @@ -3,10 +3,17 @@ from .anchor_free_bbox_coder import AnchorFreeBBoxCoder from .centerpoint_bbox_coders import CenterPointBBoxCoder from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder +from .fcos3d_bbox_coder import FCOS3DBBoxCoder from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder +from .monoflex_bbox_coder import MonoFlexCoder from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder +from .pgd_bbox_coder import PGDBBoxCoder +from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder +from .smoke_bbox_coder import SMOKECoder __all__ = [ 'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder', - 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder' + 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder', + 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder', 'PGDBBoxCoder', 'SMOKECoder', + 'MonoFlexCoder' ] diff --git a/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py b/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py index 812cae8573..d64f38b5c9 100644 --- a/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py @@ -25,7 +25,7 @@ def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \ + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. 
diff --git a/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py b/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py index 2af76ca65a..6d43a63d4b 100644 --- a/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py +++ b/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py @@ -13,12 +13,12 @@ class CenterPointBBoxCoder(BaseBBoxCoder): pc_range (list[float]): Range of point cloud. out_size_factor (int): Downsample factor of the model. voxel_size (list[float]): Size of voxel. - post_center_range (list[float]): Limit of the center. + post_center_range (list[float], optional): Limit of the center. Default: None. - max_num (int): Max number to be kept. Default: 100. - score_threshold (float): Threshold to filter boxes based on score. - Default: None. - code_size (int): Code size of bboxes. Default: 9 + max_num (int, optional): Max number to be kept. Default: 100. + score_threshold (float, optional): Threshold to filter boxes + based on score. Default: None. + code_size (int, optional): Code size of bboxes. Default: 9 """ def __init__(self, @@ -45,7 +45,8 @@ def _gather_feat(self, feats, inds, feat_masks=None): feats (torch.Tensor): Features to be transposed and gathered with the shape of [B, 2, W, H]. inds (torch.Tensor): Indexes with the shape of [B, N]. - feat_masks (torch.Tensor): Mask of the feats. Default: None. + feat_masks (torch.Tensor, optional): Mask of the feats. + Default: None. Returns: torch.Tensor: Gathered feats. @@ -64,7 +65,7 @@ def _topk(self, scores, K=80): Args: scores (torch.Tensor): scores with the shape of [B, N, W, H]. - K (int): Number to be kept. Defaults to 80. + K (int, optional): Number to be kept. Defaults to 80. Returns: tuple[torch.Tensor] @@ -135,9 +136,9 @@ def decode(self, dim (torch.Tensor): Dim of the boxes with the shape of [B, 1, W, H]. vel (torch.Tensor): Velocity with the shape of [B, 1, W, H]. - reg (torch.Tensor): Regression value of the boxes in 2D with - the shape of [B, 2, W, H]. Default: None. - task_id (int): Index of task. Default: -1. + reg (torch.Tensor, optional): Regression value of the boxes in + 2D with the shape of [B, 2, W, H]. Default: None. + task_id (int, optional): Index of task. Default: -1. Returns: list[dict]: Decoded boxes. diff --git a/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py b/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py index 1fe491a3ab..931e839872 100644 --- a/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py @@ -19,9 +19,9 @@ def __init__(self, code_size=7): @staticmethod def encode(src_boxes, dst_boxes): - """Get box regression transformation deltas (dx, dy, dz, dw, dh, dl, - dr, dv*) that can be used to transform the `src_boxes` into the - `target_boxes`. + """Get box regression transformation deltas (dx, dy, dz, dx_size, + dy_size, dz_size, dr, dv*) that can be used to transform the + `src_boxes` into the `target_boxes`. Args: src_boxes (torch.Tensor): source boxes, e.g., object proposals. @@ -56,13 +56,13 @@ def encode(src_boxes, dst_boxes): @staticmethod def decode(anchors, deltas): - """Apply transformation `deltas` (dx, dy, dz, dw, dh, dl, dr, dv*) to - `boxes`. + """Apply transformation `deltas` (dx, dy, dz, dx_size, dy_size, + dz_size, dr, dv*) to `boxes`. Args: anchors (torch.Tensor): Parameters of anchors with shape (N, 7). deltas (torch.Tensor): Encoded boxes with shape - (N, 7+n) [x, y, z, w, l, h, r, velo*]. + (N, 7+n) [x, y, z, x_size, y_size, z_size, r, velo*]. Returns: torch.Tensor: Decoded boxes. 
diff --git a/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py b/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py new file mode 100644 index 0000000000..7cb6b1a333 --- /dev/null +++ b/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from ..structures import limit_period + + +@BBOX_CODERS.register_module() +class FCOS3DBBoxCoder(BaseBBoxCoder): + """Bounding box coder for FCOS3D. + + Args: + base_depths (tuple[tuple[float]]): Depth references for decode box + depth. Defaults to None. + base_dims (tuple[tuple[float]]): Dimension references for decode box + dimension. Defaults to None. + code_size (int): The dimension of boxes to be encoded. Defaults to 7. + norm_on_bbox (bool): Whether to apply normalization on the bounding + box 2D attributes. Defaults to True. + """ + + def __init__(self, + base_depths=None, + base_dims=None, + code_size=7, + norm_on_bbox=True): + super(FCOS3DBBoxCoder, self).__init__() + self.base_depths = base_depths + self.base_dims = base_dims + self.bbox_code_size = code_size + self.norm_on_bbox = norm_on_bbox + + def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): + # TODO: refactor the encoder in the FCOS3D and PGD head + pass + + def decode(self, bbox, scale, stride, training, cls_score=None): + """Decode regressed results into 3D predictions. + + Note that offsets are not transformed to the projected 3D centers. + + Args: + bbox (torch.Tensor): Raw bounding box predictions in shape + [N, C, H, W]. + scale (tuple[`Scale`]): Learnable scale parameters. + stride (int): Stride for a specific feature level. + training (bool): Whether the decoding is in the training + procedure. + cls_score (torch.Tensor): Classification score map for deciding + which base depth or dim is used. Defaults to None. + + Returns: + torch.Tensor: Decoded boxes. + """ + # scale the bbox of different level + # only apply to offset, depth and size prediction + scale_offset, scale_depth, scale_size = scale[0:3] + + clone_bbox = bbox.clone() + bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float() + bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float() + bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float() + + if self.base_depths is None: + bbox[:, 2] = bbox[:, 2].exp() + elif len(self.base_depths) == 1: # only single prior + mean = self.base_depths[0][0] + std = self.base_depths[0][1] + bbox[:, 2] = mean + bbox.clone()[:, 2] * std + else: # multi-class priors + assert len(self.base_depths) == cls_score.shape[1], \ + 'The number of multi-class depth priors should be equal to ' \ + 'the number of categories.' + indices = cls_score.max(dim=1)[1] + depth_priors = cls_score.new_tensor( + self.base_depths)[indices, :].permute(0, 3, 1, 2) + mean = depth_priors[:, 0] + std = depth_priors[:, 1] + bbox[:, 2] = mean + bbox.clone()[:, 2] * std + + bbox[:, 3:6] = bbox[:, 3:6].exp() + if self.base_dims is not None: + assert len(self.base_dims) == cls_score.shape[1], \ + 'The number of anchor sizes should be equal to the number ' \ + 'of categories.' + indices = cls_score.max(dim=1)[1] + size_priors = cls_score.new_tensor( + self.base_dims)[indices, :].permute(0, 3, 1, 2) + bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6] + + assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ + 'has not been thoroughly tested for FCOS3D.' 
+ if self.norm_on_bbox: + if not training: + # Note that this line is conducted only when testing + bbox[:, :2] *= stride + + return bbox + + @staticmethod + def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img): + """Decode yaw angle and change it from local to global.i. + + Args: + bbox (torch.Tensor): Bounding box predictions in shape + [N, C] with yaws to be decoded. + centers2d (torch.Tensor): Projected 3D-center on the image planes + corresponding to the box predictions. + dir_cls (torch.Tensor): Predicted direction classes. + dir_offset (float): Direction offset before dividing all the + directions into several classes. + cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4]. + + Returns: + torch.Tensor: Bounding boxes with decoded yaws. + """ + if bbox.shape[0] > 0: + dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi) + bbox[..., 6] = \ + dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype) + + bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], + cam2img[0, 0]) + bbox[:, 6] + + return bbox diff --git a/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py b/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py index e20de72fb1..08d83e92c7 100644 --- a/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py @@ -14,9 +14,10 @@ class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder): num_dir_bins (int): Number of bins to encode direction angle. num_sizes (int): Number of size clusters. mean_sizes (list[list[int]]): Mean size of bboxes in each class. - with_rot (bool): Whether the bbox is with rotation. Defaults to True. - size_cls_agnostic (bool): Whether the predicted size is class-agnostic. + with_rot (bool, optional): Whether the bbox is with rotation. Defaults to True. + size_cls_agnostic (bool, optional): Whether the predicted size is + class-agnostic. Defaults to True. """ def __init__(self, @@ -36,7 +37,7 @@ def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \ + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. @@ -76,7 +77,7 @@ def decode(self, bbox_out, prefix=''): - size_class: predicted bbox size class. - size_res: predicted bbox size residual. - size: predicted class-agnostic bbox size - prefix (str): Decode predictions with specific prefix. + prefix (str, optional): Decode predictions with specific prefix. Defaults to ''. Returns: @@ -122,7 +123,7 @@ def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''): cls_preds (torch.Tensor): Class predicted features to split. reg_preds (torch.Tensor): Regression predicted features to split. base_xyz (torch.Tensor): Coordinates of points. - prefix (str): Decode predictions with specific prefix. + prefix (str, optional): Decode predictions with specific prefix. Defaults to ''. Returns: diff --git a/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py b/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py new file mode 100644 index 0000000000..e2ada29acd --- /dev/null +++ b/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py @@ -0,0 +1,515 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn import functional as F + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class MonoFlexCoder(BaseBBoxCoder): + """Bbox Coder for MonoFlex. 
+ + Args: + depth_mode (str): The mode for depth calculation. + Available options are "linear", "inv_sigmoid", and "exp". + base_depth (tuple[float]): References for decoding box depth. + depth_range (list): Depth range of predicted depth. + combine_depth (bool): Whether to use combined depth (direct depth + and depth from keypoints) or use direct depth only. + uncertainty_range (list): Uncertainty range of predicted depth. + base_dims (tuple[tuple[float]]): Dimensions mean and std of decode bbox + dimensions [l, h, w] for each category. + dims_mode (str): The mode for dimension calculation. + Available options are "linear" and "exp". + multibin (bool): Whether to use multibin representation. + num_dir_bins (int): Number of Number of bins to encode + direction angle. + bin_centers (list[float]): Local yaw centers while using multibin + representations. + bin_margin (float): Margin of multibin representations. + code_size (int): The dimension of boxes to be encoded. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-3. + """ + + def __init__(self, + depth_mode, + base_depth, + depth_range, + combine_depth, + uncertainty_range, + base_dims, + dims_mode, + multibin, + num_dir_bins, + bin_centers, + bin_margin, + code_size, + eps=1e-3): + super(MonoFlexCoder, self).__init__() + + # depth related + self.depth_mode = depth_mode + self.base_depth = base_depth + self.depth_range = depth_range + self.combine_depth = combine_depth + self.uncertainty_range = uncertainty_range + + # dimensions related + self.base_dims = base_dims + self.dims_mode = dims_mode + + # orientation related + self.multibin = multibin + self.num_dir_bins = num_dir_bins + self.bin_centers = bin_centers + self.bin_margin = bin_margin + + # output related + self.bbox_code_size = code_size + self.eps = eps + + def encode(self, gt_bboxes_3d): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (`BaseInstance3DBoxes`): Ground truth 3D bboxes. + shape: (N, 7). + + Returns: + torch.Tensor: Targets of orientations. + """ + local_yaw = gt_bboxes_3d.local_yaw + # encode local yaw (-pi ~ pi) to multibin format + encode_local_yaw = local_yaw.new_zeros( + [local_yaw.shape[0], self.num_dir_bins * 2]) + bin_size = 2 * np.pi / self.num_dir_bins + margin_size = bin_size * self.bin_margin + + bin_centers = local_yaw.new_tensor(self.bin_centers) + range_size = bin_size / 2 + margin_size + + offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0) + offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi + offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi + + for i in range(self.num_dir_bins): + offset = offsets[:, i] + inds = abs(offset) < range_size + encode_local_yaw[inds, i] = 1 + encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds] + + orientation_target = encode_local_yaw + + return orientation_target + + def decode(self, bbox, base_centers2d, labels, downsample_ratio, cam2imgs): + """Decode bounding box regression into 3D predictions. + + Args: + bbox (Tensor): Raw bounding box predictions for each + predict center2d point. + shape: (N, C) + base_centers2d (torch.Tensor): Base centers2d for 3D bboxes. + shape: (N, 2). + labels (Tensor): Batch predict class label for each predict + center2d point. + shape: (N, ) + downsample_ratio (int): The stride of feature map. + cam2imgs (Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + + Return: + dict: The 3D prediction dict decoded from regression map. 
+ the dict has components below: + - bboxes2d (torch.Tensor): Decoded [x1, y1, x2, y2] format + 2D bboxes. + - dimensions (torch.Tensor): Decoded dimensions for each + object. + - offsets2d (torch.Tenosr): Offsets between base centers2d + and real centers2d. + - direct_depth (torch.Tensor): Decoded directly regressed + depth. + - keypoints2d (torch.Tensor): Keypoints of each projected + 3D box on image. + - keypoints_depth (torch.Tensor): Decoded depth from keypoints. + - combined_depth (torch.Tensor): Combined depth using direct + depth and keypoints depth with depth uncertainty. + - orientations (torch.Tensor): Multibin format orientations + (local yaw) for each objects. + """ + + # 4 dimensions for FCOS style regression + pred_bboxes2d = bbox[:, 0:4] + + # change FCOS style to [x1, y1, x2, y2] format for IOU Loss + pred_bboxes2d = self.decode_bboxes2d(pred_bboxes2d, base_centers2d) + + # 2 dimensions for projected centers2d offsets + pred_offsets2d = bbox[:, 4:6] + + # 3 dimensions for 3D bbox dimensions offsets + pred_dimensions_offsets3d = bbox[:, 29:32] + + # the first 8 dimensions are for orientation bin classification + # and the second 8 dimensions are for orientation offsets. + pred_orientations = torch.cat((bbox[:, 32:40], bbox[:, 40:48]), dim=1) + + # 3 dimensions for the uncertainties of the solved depths from + # groups of keypoints + pred_keypoints_depth_uncertainty = bbox[:, 26:29] + + # 1 dimension for the uncertainty of directly regressed depth + pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1) + + # 2 dimension of offsets x keypoints (8 corners + top/bottom center) + pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2) + + # 1 dimension for depth offsets + pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1) + + # decode the pred residual dimensions to real dimensions + pred_dimensions = self.decode_dims(labels, pred_dimensions_offsets3d) + pred_direct_depth = self.decode_direct_depth(pred_direct_depth_offsets) + pred_keypoints_depth = self.keypoints2depth(pred_keypoints2d, + pred_dimensions, cam2imgs, + downsample_ratio) + + pred_direct_depth_uncertainty = torch.clamp( + pred_direct_depth_uncertainty, self.uncertainty_range[0], + self.uncertainty_range[1]) + pred_keypoints_depth_uncertainty = torch.clamp( + pred_keypoints_depth_uncertainty, self.uncertainty_range[0], + self.uncertainty_range[1]) + + if self.combine_depth: + pred_depth_uncertainty = torch.cat( + (pred_direct_depth_uncertainty.unsqueeze(-1), + pred_keypoints_depth_uncertainty), + dim=1).exp() + pred_depth = torch.cat( + (pred_direct_depth.unsqueeze(-1), pred_keypoints_depth), dim=1) + pred_combined_depth = \ + self.combine_depths(pred_depth, pred_depth_uncertainty) + else: + pred_combined_depth = None + + preds = dict( + bboxes2d=pred_bboxes2d, + dimensions=pred_dimensions, + offsets2d=pred_offsets2d, + keypoints2d=pred_keypoints2d, + orientations=pred_orientations, + direct_depth=pred_direct_depth, + keypoints_depth=pred_keypoints_depth, + combined_depth=pred_combined_depth, + direct_depth_uncertainty=pred_direct_depth_uncertainty, + keypoints_depth_uncertainty=pred_keypoints_depth_uncertainty, + ) + + return preds + + def decode_direct_depth(self, depth_offsets): + """Transform depth offset to directly regressed depth. + + Args: + depth_offsets (torch.Tensor): Predicted depth offsets. + shape: (N, ) + + Return: + torch.Tensor: Directly regressed depth. 
+ shape: (N, ) + """ + if self.depth_mode == 'exp': + direct_depth = depth_offsets.exp() + elif self.depth_mode == 'linear': + base_depth = depth_offsets.new_tensor(self.base_depth) + direct_depth = depth_offsets * base_depth[1] + base_depth[0] + elif self.depth_mode == 'inv_sigmoid': + direct_depth = 1 / torch.sigmoid(depth_offsets) - 1 + else: + raise ValueError + + if self.depth_range is not None: + direct_depth = torch.clamp( + direct_depth, min=self.depth_range[0], max=self.depth_range[1]) + + return direct_depth + + def decode_location(self, + base_centers2d, + offsets2d, + depths, + cam2imgs, + downsample_ratio, + pad_mode='default'): + """Retrieve object location. + + Args: + base_centers2d (torch.Tensor): predicted base centers2d. + shape: (N, 2) + offsets2d (torch.Tensor): The offsets between real centers2d + and base centers2d. + shape: (N , 2) + depths (torch.Tensor): Depths of objects. + shape: (N, ) + cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + downsample_ratio (int): The stride of feature map. + pad_mode (str, optional): Padding mode used in + training data augmentation. + + Return: + tuple(torch.Tensor): Centers of 3D boxes. + shape: (N, 3) + """ + N = cam2imgs.shape[0] + # (N, 4, 4) + cam2imgs_inv = cam2imgs.inverse() + if pad_mode == 'default': + centers2d_img = (base_centers2d + offsets2d) * downsample_ratio + else: + raise NotImplementedError + # (N, 3) + centers2d_img = \ + torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1) + # (N, 4, 1) + centers2d_extend = \ + torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)), + dim=1).unsqueeze(-1) + locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1) + + return locations[:, :3] + + def keypoints2depth(self, + keypoints2d, + dimensions, + cam2imgs, + downsample_ratio=4, + group0_index=[(7, 3), (0, 4)], + group1_index=[(2, 6), (1, 5)]): + """Decode depth form three groups of keypoints and geometry projection + model. 2D keypoints inlucding 8 coreners and top/bottom centers will be + divided into three groups which will be used to calculate three depths + of object. + + .. code-block:: none + + Group center keypoints: + + + --------------- + + /| top center /| + / | . / | + / | | / | + + ---------|----- + + + | / | | / + | / . | / + |/ bottom center |/ + + --------------- + + + Group 0 keypoints: + + 0 + + -------------- + + /| /| + / | / | + / | 5/ | + + -------------- + + + | /3 | / + | / | / + |/ |/ + + -------------- + 6 + + Group 1 keypoints: + + 4 + + -------------- + + /| /| + / | / | + / | / | + 1 + -------------- + + 7 + | / | / + | / | / + |/ |/ + 2 + -------------- + + + + Args: + keypoints2d (torch.Tensor): Keypoints of objects. + 8 vertices + top/bottom center. + shape: (N, 10, 2) + dimensions (torch.Tensor): Dimensions of objetcts. + shape: (N, 3) + cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + downsample_ratio (int, opitonal): The stride of feature map. + Defaults: 4. + group0_index(list[tuple[int]], optional): Keypoints group 0 + of index to calculate the depth. + Defaults: [0, 3, 4, 7]. + group1_index(list[tuple[int]], optional): Keypoints group 1 + of index to calculate the depth. 
+ Defaults: [1, 2, 5, 6] + + Return: + tuple(torch.Tensor): Depth computed from three groups of + keypoints (top/bottom, group0, group1) + shape: (N, 3) + """ + + pred_height_3d = dimensions[:, 1].clone() + f_u = cam2imgs[:, 0, 0] + center_height = keypoints2d[:, -2, 1] - keypoints2d[:, -1, 1] + corner_group0_height = keypoints2d[:, group0_index[0], 1] \ + - keypoints2d[:, group0_index[1], 1] + corner_group1_height = keypoints2d[:, group1_index[0], 1] \ + - keypoints2d[:, group1_index[1], 1] + center_depth = f_u * pred_height_3d / ( + F.relu(center_height) * downsample_ratio + self.eps) + corner_group0_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( + F.relu(corner_group0_height) * downsample_ratio + self.eps) + corner_group1_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( + F.relu(corner_group1_height) * downsample_ratio + self.eps) + + corner_group0_depth = corner_group0_depth.mean(dim=1) + corner_group1_depth = corner_group1_depth.mean(dim=1) + + keypoints_depth = torch.stack( + (center_depth, corner_group0_depth, corner_group1_depth), dim=1) + keypoints_depth = torch.clamp( + keypoints_depth, min=self.depth_range[0], max=self.depth_range[1]) + + return keypoints_depth + + def decode_dims(self, labels, dims_offset): + """Retrieve object dimensions. + + Args: + labels (torch.Tensor): Each points' category id. + shape: (N, K) + dims_offset (torch.Tensor): Dimension offsets. + shape: (N, 3) + + Returns: + torch.Tensor: Shape (N, 3) + """ + + if self.dims_mode == 'exp': + dims_offset = dims_offset.exp() + elif self.dims_mode == 'linear': + labels = labels.long() + base_dims = dims_offset.new_tensor(self.base_dims) + dims_mean = base_dims[:, :3] + dims_std = base_dims[:, 3:6] + cls_dimension_mean = dims_mean[labels, :] + cls_dimension_std = dims_std[labels, :] + dimensions = dims_offset * cls_dimension_mean + cls_dimension_std + else: + raise ValueError + + return dimensions + + def decode_orientation(self, ori_vector, locations): + """Retrieve object orientation. + + Args: + ori_vector (torch.Tensor): Local orientation vector + in [axis_cls, head_cls, sin, cos] format. + shape: (N, num_dir_bins * 4) + locations (torch.Tensor): Object location. + shape: (N, 3) + + Returns: + tuple[torch.Tensor]: yaws and local yaws of 3d bboxes. 
+ """ + if self.multibin: + pred_bin_cls = ori_vector[:, :self.num_dir_bins * 2].view( + -1, self.num_dir_bins, 2) + pred_bin_cls = pred_bin_cls.softmax(dim=2)[..., 1] + orientations = ori_vector.new_zeros(ori_vector.shape[0]) + for i in range(self.num_dir_bins): + mask_i = (pred_bin_cls.argmax(dim=1) == i) + start_bin = self.num_dir_bins * 2 + i * 2 + end_bin = start_bin + 2 + pred_bin_offset = ori_vector[mask_i, start_bin:end_bin] + orientations[mask_i] = pred_bin_offset[:, 0].atan2( + pred_bin_offset[:, 1]) + self.bin_centers[i] + else: + axis_cls = ori_vector[:, :2].softmax(dim=1) + axis_cls = axis_cls[:, 0] < axis_cls[:, 1] + head_cls = ori_vector[:, 2:4].softmax(dim=1) + head_cls = head_cls[:, 0] < head_cls[:, 1] + # cls axis + orientations = self.bin_centers[axis_cls + head_cls * 2] + sin_cos_offset = F.normalize(ori_vector[:, 4:]) + orientations += sin_cos_offset[:, 0].atan(sin_cos_offset[:, 1]) + + locations = locations.view(-1, 3) + rays = locations[:, 0].atan2(locations[:, 2]) + local_yaws = orientations + yaws = local_yaws + rays + + larger_idx = (yaws > np.pi).nonzero(as_tuple=False) + small_idx = (yaws < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + yaws[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + yaws[small_idx] += 2 * np.pi + + larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaws[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaws[small_idx] += 2 * np.pi + + return yaws, local_yaws + + def decode_bboxes2d(self, reg_bboxes2d, base_centers2d): + """Retrieve [x1, y1, x2, y2] format 2D bboxes. + + Args: + reg_bboxes2d (torch.Tensor): Predicted FCOS style + 2D bboxes. + shape: (N, 4) + base_centers2d (torch.Tensor): predicted base centers2d. + shape: (N, 2) + + Returns: + torch.Tenosr: [x1, y1, x2, y2] format 2D bboxes. + """ + centers_x = base_centers2d[:, 0] + centers_y = base_centers2d[:, 1] + + xs_min = centers_x - reg_bboxes2d[..., 0] + ys_min = centers_y - reg_bboxes2d[..., 1] + xs_max = centers_x + reg_bboxes2d[..., 2] + ys_max = centers_y + reg_bboxes2d[..., 3] + + bboxes2d = torch.stack([xs_min, ys_min, xs_max, ys_max], dim=-1) + + return bboxes2d + + def combine_depths(self, depth, depth_uncertainty): + """Combine all the prediced depths with depth uncertainty. + + Args: + depth (torch.Tensor): Predicted depths of each object. + 2D bboxes. + shape: (N, 4) + depth_uncertainty (torch.Tensor): Depth uncertainty for + each depth of each object. + shape: (N, 4) + + Returns: + torch.Tenosr: combined depth. + """ + uncertainty_weights = 1 / depth_uncertainty + uncertainty_weights = \ + uncertainty_weights / \ + uncertainty_weights.sum(dim=1, keepdim=True) + combined_depth = torch.sum(depth * uncertainty_weights, dim=1) + + return combined_depth diff --git a/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py b/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py index 9c22f2f778..ed8020d70d 100644 --- a/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py @@ -29,7 +29,7 @@ def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \ + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. 
diff --git a/mmdet3d/core/bbox/coders/pgd_bbox_coder.py b/mmdet3d/core/bbox/coders/pgd_bbox_coder.py new file mode 100644 index 0000000000..094ed39dce --- /dev/null +++ b/mmdet3d/core/bbox/coders/pgd_bbox_coder.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn import functional as F + +from mmdet.core.bbox.builder import BBOX_CODERS +from .fcos3d_bbox_coder import FCOS3DBBoxCoder + + +@BBOX_CODERS.register_module() +class PGDBBoxCoder(FCOS3DBBoxCoder): + """Bounding box coder for PGD.""" + + def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): + # TODO: refactor the encoder codes in the FCOS3D and PGD head + pass + + def decode_2d(self, + bbox, + scale, + stride, + max_regress_range, + training, + pred_keypoints=False, + pred_bbox2d=True): + """Decode regressed 2D attributes. + + Args: + bbox (torch.Tensor): Raw bounding box predictions in shape + [N, C, H, W]. + scale (tuple[`Scale`]): Learnable scale parameters. + stride (int): Stride for a specific feature level. + max_regress_range (int): Maximum regression range for a specific + feature level. + training (bool): Whether the decoding is in the training + procedure. + pred_keypoints (bool, optional): Whether to predict keypoints. + Defaults to False. + pred_bbox2d (bool, optional): Whether to predict 2D bounding + boxes. Defaults to False. + + Returns: + torch.Tensor: Decoded boxes. + """ + clone_bbox = bbox.clone() + if pred_keypoints: + scale_kpts = scale[3] + # 2 dimension of offsets x 8 corners of a 3D bbox + bbox[:, self.bbox_code_size:self.bbox_code_size + 16] = \ + torch.tanh(scale_kpts(clone_bbox[ + :, self.bbox_code_size:self.bbox_code_size + 16]).float()) + + if pred_bbox2d: + scale_bbox2d = scale[-1] + # The last four dimensions are offsets to four sides of a 2D bbox + bbox[:, -4:] = scale_bbox2d(clone_bbox[:, -4:]).float() + + if self.norm_on_bbox: + if pred_bbox2d: + bbox[:, -4:] = F.relu(bbox.clone()[:, -4:]) + if not training: + if pred_keypoints: + bbox[ + :, self.bbox_code_size:self.bbox_code_size + 16] *= \ + max_regress_range + if pred_bbox2d: + bbox[:, -4:] *= stride + else: + if pred_bbox2d: + bbox[:, -4:] = bbox.clone()[:, -4:].exp() + return bbox + + def decode_prob_depth(self, depth_cls_preds, depth_range, depth_unit, + division, num_depth_cls): + """Decode probabilistic depth map. + + Args: + depth_cls_preds (torch.Tensor): Depth probabilistic map in shape + [..., self.num_depth_cls] (raw output before softmax). + depth_range (tuple[float]): Range of depth estimation. + depth_unit (int): Unit of depth range division. + division (str): Depth division method. Options include 'uniform', + 'linear', 'log', 'loguniform'. + num_depth_cls (int): Number of depth classes. + + Returns: + torch.Tensor: Decoded probabilistic depth estimation. 
+ """ + if division == 'uniform': + depth_multiplier = depth_unit * \ + depth_cls_preds.new_tensor( + list(range(num_depth_cls))).reshape([1, -1]) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'linear': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + depth_multiplier = depth_range[0] + ( + depth_range[1] - depth_range[0]) / \ + (num_depth_cls * (num_depth_cls - 1)) * \ + (split_pts * (split_pts+1)) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'log': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + start = max(depth_range[0], 1) + end = depth_range[1] + depth_multiplier = (np.log(start) + + split_pts * np.log(end / start) / + (num_depth_cls - 1)).exp() + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'loguniform': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + start = max(depth_range[0], 1) + end = depth_range[1] + log_multiplier = np.log(start) + \ + split_pts * np.log(end / start) / (num_depth_cls - 1) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + log_multiplier).sum(dim=-1).exp() + return prob_depth_preds + else: + raise NotImplementedError diff --git a/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py b/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py new file mode 100644 index 0000000000..d246777ba6 --- /dev/null +++ b/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class PointXYZWHLRBBoxCoder(BaseBBoxCoder): + """Point based bbox coder for 3D boxes. + + Args: + code_size (int): The dimension of boxes to be encoded. + use_mean_size (bool, optional): Whether using anchors based on class. + Defaults to True. + mean_size (list[list[float]], optional): Mean size of bboxes in + each class. Defaults to None. + """ + + def __init__(self, code_size=7, use_mean_size=True, mean_size=None): + super(PointXYZWHLRBBoxCoder, self).__init__() + self.code_size = code_size + self.use_mean_size = use_mean_size + if self.use_mean_size: + self.mean_size = torch.from_numpy(np.array(mean_size)).float() + assert self.mean_size.min() > 0, \ + f'The min of mean_size should > 0, however currently it is '\ + f'{self.mean_size.min()}, please check it in your config.' + + def encode(self, gt_bboxes_3d, points, gt_labels_3d=None): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth bboxes + with shape (N, 7 + C). + points (torch.Tensor): Point cloud with shape (N, 3). + gt_labels_3d (torch.Tensor, optional): Ground truth classes. + Defaults to None. + + Returns: + torch.Tensor: Encoded boxes with shape (N, 8 + C). 
+ """ + gt_bboxes_3d[:, 3:6] = torch.clamp_min(gt_bboxes_3d[:, 3:6], min=1e-5) + + xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split( + gt_bboxes_3d, 1, dim=-1) + xa, ya, za = torch.split(points, 1, dim=-1) + + if self.use_mean_size: + assert gt_labels_3d.max() <= self.mean_size.shape[0] - 1, \ + f'the max gt label {gt_labels_3d.max()} is bigger than' \ + f'anchor types {self.mean_size.shape[0] - 1}.' + self.mean_size = self.mean_size.to(gt_labels_3d.device) + point_anchor_size = self.mean_size[gt_labels_3d] + dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) + diagonal = torch.sqrt(dxa**2 + dya**2) + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / dza + dxt = torch.log(dxg / dxa) + dyt = torch.log(dyg / dya) + dzt = torch.log(dzg / dza) + else: + xt = (xg - xa) + yt = (yg - ya) + zt = (zg - za) + dxt = torch.log(dxg) + dyt = torch.log(dyg) + dzt = torch.log(dzg) + + return torch.cat( + [xt, yt, zt, dxt, dyt, dzt, + torch.cos(rg), + torch.sin(rg), *cgs], + dim=-1) + + def decode(self, box_encodings, points, pred_labels_3d=None): + """Decode predicted parts and points to bbox3d. + + Args: + box_encodings (torch.Tensor): Encoded boxes with shape (N, 8 + C). + points (torch.Tensor): Point cloud with shape (N, 3). + pred_labels_3d (torch.Tensor): Bbox predicted labels (N, M). + + Returns: + torch.Tensor: Decoded boxes with shape (N, 7 + C) + """ + xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split( + box_encodings, 1, dim=-1) + xa, ya, za = torch.split(points, 1, dim=-1) + + if self.use_mean_size: + assert pred_labels_3d.max() <= self.mean_size.shape[0] - 1, \ + f'The max pred label {pred_labels_3d.max()} is bigger than' \ + f'anchor types {self.mean_size.shape[0] - 1}.' + self.mean_size = self.mean_size.to(pred_labels_3d.device) + point_anchor_size = self.mean_size[pred_labels_3d] + dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) + diagonal = torch.sqrt(dxa**2 + dya**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * dza + za + + dxg = torch.exp(dxt) * dxa + dyg = torch.exp(dyt) * dya + dzg = torch.exp(dzt) * dza + else: + xg = xt + xa + yg = yt + ya + zg = zt + za + dxg, dyg, dzg = torch.split( + torch.exp(box_encodings[..., 3:6]), 1, dim=-1) + + rg = torch.atan2(sint, cost) + + return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cts], dim=-1) diff --git a/mmdet3d/core/bbox/coders/smoke_bbox_coder.py b/mmdet3d/core/bbox/coders/smoke_bbox_coder.py new file mode 100644 index 0000000000..134af3a729 --- /dev/null +++ b/mmdet3d/core/bbox/coders/smoke_bbox_coder.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class SMOKECoder(BaseBBoxCoder): + """Bbox Coder for SMOKE. + + Args: + base_depth (tuple[float]): Depth references for decode box depth. + base_dims (tuple[tuple[float]]): Dimension references [l, h, w] + for decode box dimension for each category. + code_size (int): The dimension of boxes to be encoded. + """ + + def __init__(self, base_depth, base_dims, code_size): + super(SMOKECoder, self).__init__() + self.base_depth = base_depth + self.base_dims = base_dims + self.bbox_code_size = code_size + + def encode(self, locations, dimensions, orientations, input_metas): + """Encode CameraInstance3DBoxes by locations, dimensions, orientations. + + Args: + locations (Tensor): Center location for 3D boxes. 
+ (N, 3) + dimensions (Tensor): Dimensions for 3D boxes. + shape (N, 3) + orientations (Tensor): Orientations for 3D boxes. + shape (N, 1) + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Return: + :obj:`CameraInstance3DBoxes`: 3D bboxes of batch images, + shape (N, bbox_code_size). + """ + + bboxes = torch.cat((locations, dimensions, orientations), dim=1) + assert bboxes.shape[1] == self.bbox_code_size, 'bboxes shape dose not'\ + 'match the bbox_code_size.' + batch_bboxes = input_metas[0]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size) + + return batch_bboxes + + def decode(self, + reg, + points, + labels, + cam2imgs, + trans_mats, + locations=None): + """Decode regression into locations, dimensions, orientations. + + Args: + reg (Tensor): Batch regression for each predict center2d point. + shape: (batch * K (max_objs), C) + points(Tensor): Batch projected bbox centers on image plane. + shape: (batch * K (max_objs) , 2) + labels (Tensor): Batch predict class label for each predict + center2d point. + shape: (batch, K (max_objs)) + cam2imgs (Tensor): Batch images' camera intrinsic matrix. + shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) + trans_mats (Tensor): transformation matrix from original image + to feature map. + shape: (batch, 3, 3) + locations (None | Tensor): if locations is None, this function + is used to decode while inference, otherwise, it's used while + training using the ground truth 3d bbox locations. + shape: (batch * K (max_objs), 3) + + Return: + tuple(Tensor): The tuple has components below: + - locations (Tensor): Centers of 3D boxes. + shape: (batch * K (max_objs), 3) + - dimensions (Tensor): Dimensions of 3D boxes. + shape: (batch * K (max_objs), 3) + - orientations (Tensor): Orientations of 3D + boxes. + shape: (batch * K (max_objs), 1) + """ + depth_offsets = reg[:, 0] + centers2d_offsets = reg[:, 1:3] + dimensions_offsets = reg[:, 3:6] + orientations = reg[:, 6:8] + depths = self._decode_depth(depth_offsets) + # get the 3D Bounding box's center location. + pred_locations = self._decode_location(points, centers2d_offsets, + depths, cam2imgs, trans_mats) + pred_dimensions = self._decode_dimension(labels, dimensions_offsets) + if locations is None: + pred_orientations = self._decode_orientation( + orientations, pred_locations) + else: + pred_orientations = self._decode_orientation( + orientations, locations) + + return pred_locations, pred_dimensions, pred_orientations + + def _decode_depth(self, depth_offsets): + """Transform depth offset to depth.""" + base_depth = depth_offsets.new_tensor(self.base_depth) + depths = depth_offsets * base_depth[1] + base_depth[0] + + return depths + + def _decode_location(self, points, centers2d_offsets, depths, cam2imgs, + trans_mats): + """Retrieve objects location in camera coordinate based on projected + points. + + Args: + points (Tensor): Projected points on feature map in (x, y) + shape: (batch * K, 2) + centers2d_offset (Tensor): Project points offset in + (delta_x, delta_y). shape: (batch * K, 2) + depths (Tensor): Object depth z. + shape: (batch * K) + cam2imgs (Tensor): Batch camera intrinsics matrix. + shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) + trans_mats (Tensor): transformation matrix from original image + to feature map. 
+ shape: (batch, 3, 3) + """ + # number of points + N = centers2d_offsets.shape[0] + # batch_size + N_batch = cam2imgs.shape[0] + batch_id = torch.arange(N_batch).unsqueeze(1) + obj_id = batch_id.repeat(1, N // N_batch).flatten() + trans_mats_inv = trans_mats.inverse()[obj_id] + cam2imgs_inv = cam2imgs.inverse()[obj_id] + centers2d = points + centers2d_offsets + centers2d_extend = torch.cat((centers2d, centers2d.new_ones(N, 1)), + dim=1) + # expand project points as [N, 3, 1] + centers2d_extend = centers2d_extend.unsqueeze(-1) + # transform project points back on original image + centers2d_img = torch.matmul(trans_mats_inv, centers2d_extend) + centers2d_img = centers2d_img * depths.view(N, -1, 1) + if cam2imgs.shape[1] == 4: + centers2d_img = torch.cat( + (centers2d_img, centers2d.new_ones(N, 1, 1)), dim=1) + locations = torch.matmul(cam2imgs_inv, centers2d_img).squeeze(2) + + return locations[:, :3] + + def _decode_dimension(self, labels, dims_offset): + """Transform dimension offsets to dimension according to its category. + + Args: + labels (Tensor): Each points' category id. + shape: (N, K) + dims_offset (Tensor): Dimension offsets. + shape: (N, 3) + """ + labels = labels.flatten().long() + base_dims = dims_offset.new_tensor(self.base_dims) + dims_select = base_dims[labels, :] + dimensions = dims_offset.exp() * dims_select + + return dimensions + + def _decode_orientation(self, ori_vector, locations): + """Retrieve object orientation. + + Args: + ori_vector (Tensor): Local orientation in [sin, cos] format. + shape: (N, 2) + locations (Tensor): Object location. + shape: (N, 3) + + Return: + Tensor: yaw(Orientation). Notice that the yaw's + range is [-np.pi, np.pi]. + shape:(N, 1) + """ + assert len(ori_vector) == len(locations) + locations = locations.view(-1, 3) + rays = torch.atan(locations[:, 0] / (locations[:, 2] + 1e-7)) + alphas = torch.atan(ori_vector[:, 0] / (ori_vector[:, 1] + 1e-7)) + + # get cosine value positive and negative index. + cos_pos_inds = (ori_vector[:, 1] >= 0).nonzero(as_tuple=False) + cos_neg_inds = (ori_vector[:, 1] < 0).nonzero(as_tuple=False) + + alphas[cos_pos_inds] -= np.pi / 2 + alphas[cos_neg_inds] += np.pi / 2 + # retrieve object rotation y angle. + yaws = alphas + rays + + larger_inds = (yaws > np.pi).nonzero(as_tuple=False) + small_inds = (yaws < -np.pi).nonzero(as_tuple=False) + + if len(larger_inds) != 0: + yaws[larger_inds] -= 2 * np.pi + if len(small_inds) != 0: + yaws[small_inds] += 2 * np.pi + + yaws = yaws.unsqueeze(-1) + return yaws diff --git a/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py b/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py index 7cc8af63f3..2b1d8eabb7 100644 --- a/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py +++ b/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py @@ -31,15 +31,17 @@ def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): between each aligned pair of bboxes1 and bboxes2. Args: - bboxes1 (torch.Tensor): shape (N, 7+N) [x, y, z, h, w, l, ry, v]. - bboxes2 (torch.Tensor): shape (M, 7+N) [x, y, z, h, w, l, ry, v]. + bboxes1 (torch.Tensor): shape (N, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + bboxes2 (torch.Tensor): shape (M, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. mode (str): "iou" (intersection over union) or iof (intersection over foreground). is_aligned (bool): Whether the calculation is aligned. Return: - torch.Tensor: If ``is_aligned`` is ``True``, return ious between \ - bboxes1 and bboxes2 with shape (M, N). 
If ``is_aligned`` is \ + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is ``False``, return shape is M. """ return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, @@ -74,13 +76,15 @@ def __call__(self, bboxes1, bboxes2, mode='iou'): calculate the actual 3D IoUs of boxes. Args: - bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry]. - bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry]. + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). Return: - torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \ + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 with shape (M, N) (aligned mode is not supported currently). """ return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) @@ -102,7 +106,7 @@ def bbox_overlaps_nearest_3d(bboxes1, Note: This function first finds the nearest 2D boxes in bird eye view (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. - Ths IoU calculator :class:`BboxOverlapsNearest3D` uses this + This IoU calculator :class:`BboxOverlapsNearest3D` uses this function to calculate IoUs of boxes. If ``is_aligned`` is ``False``, then it calculates the ious between @@ -110,15 +114,17 @@ def bbox_overlaps_nearest_3d(bboxes1, aligned pair of bboxes1 and bboxes2. Args: - bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v]. - bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v]. + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). is_aligned (bool): Whether the calculation is aligned Return: - torch.Tensor: If ``is_aligned`` is ``True``, return ious between \ - bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is \ + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is ``False``, return shape is M. """ assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 @@ -148,14 +154,16 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): calculate the actual IoUs of boxes. Args: - bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry]. - bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry]. + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). coordinate (str): 'camera' or 'lidar' coordinate system. Return: - torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \ + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 with shape (M, N) (aligned mode is not supported currently). """ assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 @@ -185,7 +193,7 @@ def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. - Default False. + Defaults to False. 
Returns: Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) """ @@ -219,9 +227,9 @@ def axis_aligned_bbox_overlaps_3d(bboxes1, mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. - Default False. + Defaults to False. eps (float, optional): A value added to the denominator for numerical - stability. Default 1e-6. + stability. Defaults to 1e-6. Returns: Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) @@ -250,7 +258,7 @@ def axis_aligned_bbox_overlaps_3d(bboxes1, """ assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' - # Either the boxes are empty or the length of boxes's last dimenstion is 6 + # Either the boxes are empty or the length of boxes's last dimension is 6 assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) diff --git a/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py b/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py index ecf4b653b4..cbd8483cad 100644 --- a/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py +++ b/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py @@ -9,8 +9,8 @@ class IoUNegPiecewiseSampler(RandomSampler): """IoU Piece-wise Sampling. - Sampling negtive proposals according to a list of IoU thresholds. - The negtive proposals are divided into several pieces according + Sampling negative proposals according to a list of IoU thresholds. + The negative proposals are divided into several pieces according to `neg_iou_piece_thrs`. And the ratio of each piece is indicated by `neg_piece_fractions`. @@ -18,11 +18,11 @@ class IoUNegPiecewiseSampler(RandomSampler): num (int): Number of proposals. pos_fraction (float): The fraction of positive proposals. neg_piece_fractions (list): A list contains fractions that indicates - the ratio of each piece of total negtive samplers. + the ratio of each piece of total negative samplers. neg_iou_piece_thrs (list): A list contains IoU thresholds that indicate the upper bound of this piece. neg_pos_ub (float): The total ratio to limit the upper bound - number of negtive samples. + number of negative samples. add_gt_as_proposals (bool): Whether to add gt as proposals. 
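The axis-aligned overlap function a little above reduces 3D IoU to intersection volume over union volume of (x1, y1, z1, x2, y2, z2) boxes. A hand-checkable sketch of the aligned case, written independently of the repo code:

import torch

def aligned_iou_3d(boxes1, boxes2, eps=1e-6):
    # boxes*: (N, 6) axis-aligned boxes as (x1, y1, z1, x2, y2, z2)
    vol1 = (boxes1[:, 3:] - boxes1[:, :3]).prod(dim=-1)
    vol2 = (boxes2[:, 3:] - boxes2[:, :3]).prod(dim=-1)
    lt = torch.max(boxes1[:, :3], boxes2[:, :3])  # lower corner of overlap
    rb = torch.min(boxes1[:, 3:], boxes2[:, 3:])  # upper corner of overlap
    overlap = (rb - lt).clamp(min=0).prod(dim=-1)
    return overlap / (vol1 + vol2 - overlap + eps)

a = torch.tensor([[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]])
b = torch.tensor([[0.5, 0.0, 0.0, 1.5, 1.0, 1.0]])
print(aligned_iou_3d(a, b))  # tensor([0.3333]) -- 0.5 overlap / 1.5 union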
""" @@ -59,8 +59,8 @@ def _sample_neg(self, assign_result, num_expected, **kwargs): neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) if neg_inds.numel() != 0: neg_inds = neg_inds.squeeze(1) - if len(neg_inds) <= num_expected: - return neg_inds + if len(neg_inds) <= 0: + return neg_inds.squeeze(1) else: neg_inds_choice = neg_inds.new_zeros([0]) extend_num = 0 @@ -88,12 +88,38 @@ def _sample_neg(self, assign_result, num_expected, **kwargs): neg_inds_choice = torch.cat( [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0) extend_num += piece_expected_num - len(piece_neg_inds) + + # for the last piece + if piece_inds == self.neg_piece_num - 1: + extend_neg_num = num_expected - len(neg_inds_choice) + # if the numbers of nagetive samples > 0, we will + # randomly select num_expected samples in last piece + if piece_neg_inds.numel() > 0: + rand_idx = torch.randint( + low=0, + high=piece_neg_inds.numel(), + size=(extend_neg_num, )).long() + neg_inds_choice = torch.cat( + [neg_inds_choice, piece_neg_inds[rand_idx]], + dim=0) + # if the numbers of nagetive samples == 0, we will + # randomly select num_expected samples in all + # previous pieces + else: + rand_idx = torch.randint( + low=0, + high=neg_inds_choice.numel(), + size=(extend_neg_num, )).long() + neg_inds_choice = torch.cat( + [neg_inds_choice, neg_inds_choice[rand_idx]], + dim=0) else: piece_choice = self.random_choice(piece_neg_inds, piece_expected_num) neg_inds_choice = torch.cat( [neg_inds_choice, neg_inds[piece_choice]], dim=0) extend_num = 0 + assert len(neg_inds_choice) == num_expected return neg_inds_choice def sample(self, @@ -111,7 +137,7 @@ def sample(self, assign_result (:obj:`AssignResult`): Bbox assigning results. bboxes (torch.Tensor): Boxes to be sampled from. gt_bboxes (torch.Tensor): Ground truth bboxes. - gt_labels (torch.Tensor, optional): Class labels of ground truth \ + gt_labels (torch.Tensor, optional): Class labels of ground truth bboxes. Returns: @@ -145,7 +171,6 @@ def sample(self, num_expected_neg = neg_upper_bound neg_inds = self.neg_sampler._sample_neg( assign_result, num_expected_neg, bboxes=bboxes, **kwargs) - neg_inds = neg_inds.unique() sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) diff --git a/mmdet3d/core/bbox/structures/__init__.py b/mmdet3d/core/bbox/structures/__init__.py index 58c111ef43..460035a533 100644 --- a/mmdet3d/core/bbox/structures/__init__.py +++ b/mmdet3d/core/bbox/structures/__init__.py @@ -6,12 +6,13 @@ from .depth_box3d import DepthInstance3DBoxes from .lidar_box3d import LiDARInstance3DBoxes from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period, - mono_cam_box2vis, points_cam2img, rotation_3d_in_axis, - xywhr2xyxyr) + mono_cam_box2vis, points_cam2img, points_img2cam, + rotation_3d_in_axis, xywhr2xyxyr) __all__ = [ 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', - 'Coord3DMode', 'mono_cam_box2vis', 'get_proj_mat_by_coord_type' + 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', + 'get_proj_mat_by_coord_type' ] diff --git a/mmdet3d/core/bbox/structures/base_box3d.py b/mmdet3d/core/bbox/structures/base_box3d.py index 9e060a4e1c..4bac8105c7 100644 --- a/mmdet3d/core/bbox/structures/base_box3d.py +++ b/mmdet3d/core/bbox/structures/base_box3d.py @@ -1,8 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import warnings +from abc import abstractmethod + import numpy as np import torch -from abc import abstractmethod +from mmdet3d.ops import points_in_boxes_all, points_in_boxes_part from mmdet3d.ops.iou3d import iou3d_cuda from .utils import limit_period, xywhr2xyxyr @@ -18,12 +21,12 @@ class BaseInstance3DBoxes(object): tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. box_dim (int): Number of the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw). - Default to 7. + Defaults to 7. with_yaw (bool): Whether the box is with yaw rotation. If False, the value of yaw will be set to 0 as minmax boxes. - Default to True. - origin (tuple[float]): The relative position of origin in the box. - Default to (0.5, 0.5, 0). This will guide the box be converted to + Defaults to True. + origin (tuple[float], optional): Relative position of the box origin. + Defaults to (0.5, 0.5, 0). This will guide the box be converted to (0.5, 0.5, 0) mode. Attributes: @@ -72,27 +75,29 @@ def volume(self): @property def dims(self): - """torch.Tensor: Corners of each box with size (N, 8, 3).""" + """torch.Tensor: Size dimensions of each box in shape (N, 3).""" return self.tensor[:, 3:6] @property def yaw(self): - """torch.Tensor: A vector with yaw of each box.""" + """torch.Tensor: A vector with yaw of each box in shape (N, ).""" return self.tensor[:, 6] @property def height(self): - """torch.Tensor: A vector with height of each box.""" + """torch.Tensor: A vector with height of each box in shape (N, ).""" return self.tensor[:, 5] @property def top_height(self): - """torch.Tensor: A vector with the top height of each box.""" + """torch.Tensor: + A vector with the top height of each box in shape (N, ).""" return self.bottom_height + self.height @property def bottom_height(self): - """torch.Tensor: A vector with bottom's height of each box.""" + """torch.Tensor: + A vector with bottom's height of each box in shape (N, ).""" return self.tensor[:, 2] @property @@ -100,58 +105,114 @@ def center(self): """Calculate the center of all the boxes. Note: - In the MMDetection3D's convention, the bottom center is + In MMDetection3D's convention, the bottom center is usually taken as the default center. The relative position of the centers in different kinds of boxes are different, e.g., the relative center of a boxes is (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is recommended to use ``bottom_center`` or ``gravity_center`` - for more clear usage. + for clearer usage. Returns: - torch.Tensor: A tensor with center of each box. + torch.Tensor: A tensor with center of each box in shape (N, 3). 
""" return self.bottom_center @property def bottom_center(self): - """torch.Tensor: A tensor with center of each box.""" + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" return self.tensor[:, :3] @property def gravity_center(self): - """torch.Tensor: A tensor with center of each box.""" + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" pass @property def corners(self): - """torch.Tensor: a tensor with 8 corners of each box.""" + """torch.Tensor: + a tensor with 8 corners of each box in shape (N, 8, 3).""" pass + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format, in shape (N, 5).""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): the range of box + (x_min, y_min, x_max, y_max) + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Whether each box is inside the reference range. + """ + in_range_flags = ((self.bev[:, 0] > box_range[0]) + & (self.bev[:, 1] > box_range[1]) + & (self.bev[:, 0] < box_range[2]) + & (self.bev[:, 1] < box_range[3])) + return in_range_flags + @abstractmethod def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or \ - rotation matrix. + """Rotate boxes with points (optional) with the given angle or rotation + matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + points (torch.Tensor | numpy.ndarray | + :obj:`BasePoints`, optional): Points to rotate. Defaults to None. """ pass @abstractmethod def flip(self, bev_direction='horizontal'): - """Flip the boxes in BEV along given BEV direction.""" + """Flip the boxes in BEV along given BEV direction. + + Args: + bev_direction (str, optional): Direction by which to flip. + Can be chosen from 'horizontal' and 'vertical'. + Defaults to 'horizontal'. + """ pass def translate(self, trans_vector): """Translate boxes with the given translation vector. Args: - trans_vector (torch.Tensor): Translation vector of size 1x3. + trans_vector (torch.Tensor): Translation vector of size (1, 3). """ if not isinstance(trans_vector, torch.Tensor): trans_vector = self.tensor.new_tensor(trans_vector) @@ -170,7 +231,7 @@ def in_range_3d(self, box_range): polygon, we try to reduce the burden for simpler cases. Returns: - torch.Tensor: A binary vector indicating whether each box is \ + torch.Tensor: A binary vector indicating whether each box is inside the reference range. 
""" in_range_flags = ((self.tensor[:, 0] > box_range[0]) @@ -181,34 +242,21 @@ def in_range_3d(self, box_range): & (self.tensor[:, 2] < box_range[5])) return in_range_flags - @abstractmethod - def in_range_bev(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): The range of box - in order of (x_min, y_min, x_max, y_max). - - Returns: - torch.Tensor: Indicating whether each box is inside \ - the reference range. - """ - pass - @abstractmethod def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BaseInstance3DBoxes`: The converted box of the same type \ + :obj:`BaseInstance3DBoxes`: The converted box of the same type in the `dst` mode. """ pass @@ -220,28 +268,29 @@ def scale(self, scale_factor): scale_factors (float): Scale factors to scale the boxes. """ self.tensor[:, :6] *= scale_factor - self.tensor[:, 7:] *= scale_factor + self.tensor[:, 7:] *= scale_factor # velocity def limit_yaw(self, offset=0.5, period=np.pi): """Limit the yaw to a given period and offset. Args: - offset (float): The offset of the yaw. - period (float): The expected period. + offset (float, optional): The offset of the yaw. Defaults to 0.5. + period (float, optional): The expected period. Defaults to np.pi. """ self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) - def nonempty(self, threshold: float = 0.0): + def nonempty(self, threshold=0.0): """Find boxes that are non-empty. A box is considered empty, if either of its side is no larger than threshold. Args: - threshold (float): The threshold of minimal sizes. + threshold (float, optional): The threshold of minimal sizes. + Defaults to 0.0. Returns: - torch.Tensor: A binary vector which represents whether each \ + torch.Tensor: A binary vector which represents whether each box is empty (False) or non-empty (True). """ box = self.tensor @@ -267,8 +316,8 @@ def __getitem__(self, item): subject to Pytorch's indexing semantics. Returns: - :obj:`BaseInstance3DBoxes`: A new object of \ - :class:`BaseInstances3DBoxes` after indexing. + :obj:`BaseInstance3DBoxes`: A new object of + :class:`BaseInstance3DBoxes` after indexing. """ original_type = type(self) if isinstance(item, int): @@ -319,7 +368,7 @@ def to(self, device): device (str | :obj:`torch.device`): The name of the device. Returns: - :obj:`BaseInstance3DBoxes`: A new boxes object on the \ + :obj:`BaseInstance3DBoxes`: A new boxes object on the specific device. """ original_type = type(self) @@ -332,7 +381,7 @@ def clone(self): """Clone the Boxes. Returns: - :obj:`BaseInstance3DBoxes`: Box object with the same properties \ + :obj:`BaseInstance3DBoxes`: Box object with the same properties as self. """ original_type = type(self) @@ -363,7 +412,7 @@ def height_overlaps(cls, boxes1, boxes2, mode='iou'): Args: boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. - mode (str, optional): Mode of iou calculation. Defaults to 'iou'. 
+ mode (str, optional): Mode of IoU calculation. Defaults to 'iou'. Returns: torch.Tensor: Calculated iou of boxes. @@ -444,14 +493,14 @@ def overlaps(cls, boxes1, boxes2, mode='iou'): def new_box(self, data): """Create a new box object with data. - The new box and its tensor has the similar properties \ + The new box and its tensor has the similar properties as self and self.tensor, respectively. Args: data (torch.Tensor | numpy.array | list): Data to be copied. Returns: - :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, \ + :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the object's other properties are similar to ``self``. """ new_tensor = self.tensor.new_tensor(data) \ @@ -459,3 +508,75 @@ def new_box(self, data): original_type = type(self) return original_type( new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def points_in_boxes_part(self, points, boxes_override=None): + """Find the box in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor`. Defaults to None. + + Returns: + torch.Tensor: The index of the first box that each point + is in, in shape (M, ). Default value is -1 + (if the point is not enclosed by any box). + + Note: + If a point is enclosed by multiple boxes, the index of the + first box will be returned. + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + if points.dim() == 2: + points = points.unsqueeze(0) + box_idx = points_in_boxes_part(points, + boxes.unsqueeze(0).to( + points.device)).squeeze(0) + return box_idx + + def points_in_boxes_all(self, points, boxes_override=None): + """Find all boxes in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor`. Defaults to None. + + Returns: + torch.Tensor: A tensor indicating whether a point is in a box, + in shape (M, T). T is the number of boxes. Denote this + tensor as A, if the m^th point is in the t^th box, then + `A[m, t] == 1`, elsewise `A[m, t] == 0`. + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) + + return box_idxs_of_pts.squeeze(0) + + def points_in_boxes(self, points, boxes_override=None): + warnings.warn('DeprecationWarning: points_in_boxes is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_part.') + return self.points_in_boxes_part(points, boxes_override) + + def points_in_boxes_batch(self, points, boxes_override=None): + warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_all.') + return self.points_in_boxes_all(points, boxes_override) diff --git a/mmdet3d/core/bbox/structures/box_3d_mode.py b/mmdet3d/core/bbox/structures/box_3d_mode.py index 6e2db4fc26..3048b0addb 100644 --- a/mmdet3d/core/bbox/structures/box_3d_mode.py +++ b/mmdet3d/core/bbox/structures/box_3d_mode.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from enum import IntEnum, unique + import numpy as np import torch -from enum import IntEnum, unique from .base_box3d import BaseInstance3DBoxes from .cam_box3d import CameraInstance3DBoxes from .depth_box3d import DepthInstance3DBoxes from .lidar_box3d import LiDARInstance3DBoxes +from .utils import limit_period @unique @@ -61,23 +63,28 @@ class Box3DMode(IntEnum): DEPTH = 2 @staticmethod - def convert(box, src, dst, rt_mat=None): + def convert(box, src, dst, rt_mat=None, with_yaw=True): """Convert boxes from `src` mode to `dst` mode. Args: box (tuple | list | np.ndarray | - torch.Tensor | BaseInstance3DBoxes): + torch.Tensor | :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. src (:obj:`Box3DMode`): The src Box mode. dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. + with_yaw (bool, optional): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. Returns: - (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes`): The converted box of the same type. """ if src == dst: @@ -100,32 +107,53 @@ def convert(box, src, dst, rt_mat=None): else: arr = box.clone() + if is_Instance3DBoxes: + with_yaw = box.with_yaw + # convert box from `src` mode to `dst` mode. 
x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if with_yaw: + yaw = arr[..., 6:7] if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) - xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + yaw = yaw + np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + yaw = yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) else: raise NotImplementedError( f'Conversion from Box3DMode {src} to {dst} ' @@ -135,13 +163,17 @@ def convert(box, src, dst, rt_mat=None): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( - [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: - xyz = arr[:, :3] @ rt_mat.t() + xyz = arr[..., :3] @ rt_mat.t() - remains = arr[..., 6:] - arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + if with_yaw: + remains = arr[..., 7:] + arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) + else: + remains = arr[..., 6:] + arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) # convert arr to the original type original_type = type(box) @@ -160,7 +192,6 @@ def convert(box, src, dst, rt_mat=None): raise NotImplementedError( f'Conversion to {dst} through {original_type}' ' is not supported yet') - return target_type( - arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) else: return arr diff --git a/mmdet3d/core/bbox/structures/cam_box3d.py b/mmdet3d/core/bbox/structures/cam_box3d.py index ba0a2cb994..b708613441 100644 --- a/mmdet3d/core/bbox/structures/cam_box3d.py +++ b/mmdet3d/core/bbox/structures/cam_box3d.py @@ -2,9 +2,9 @@ import numpy as np import torch -from mmdet3d.core.points import BasePoints +from ...points import BasePoints from .base_box3d import BaseInstance3DBoxes -from .utils import limit_period, rotation_3d_in_axis +from 
.utils import rotation_3d_in_axis, yaw2local class CameraInstance3DBoxes(BaseInstance3DBoxes): @@ -28,16 +28,14 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes): The yaw is 0 at the positive direction of x axis, and decreases from the positive direction of x to the positive direction of z. - A refactor is ongoing to make the three coordinate systems - easier to understand and convert between each other. - Attributes: - tensor (torch.Tensor): Float matrix of N x box_dim. - box_dim (int): Integer indicates the dimension of a box + tensor (torch.Tensor): Float matrix in shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). - with_yaw (bool): If True, the value of yaw will be set to 0 as minmax - boxes. + with_yaw (bool): If True, the value of yaw will be set to 0 as + axis-aligned boxes tightly enclosing the original boxes. """ + YAW_AXIS = 1 def __init__(self, tensor, @@ -76,23 +74,39 @@ def __init__(self, @property def height(self): - """torch.Tensor: A vector with height of each box.""" + """torch.Tensor: A vector with height of each box in shape (N, ).""" return self.tensor[:, 4] @property def top_height(self): - """torch.Tensor: A vector with the top height of each box.""" + """torch.Tensor: + A vector with the top height of each box in shape (N, ).""" # the positive direction is down rather than up return self.bottom_height - self.height @property def bottom_height(self): - """torch.Tensor: A vector with bottom's height of each box.""" + """torch.Tensor: + A vector with bottom's height of each box in shape (N, ).""" return self.tensor[:, 1] + @property + def local_yaw(self): + """torch.Tensor: + A vector with local yaw of each box in shape (N, ). + local_yaw equals to alpha in kitti, which is commonly + used in monocular 3D object detection task, so only + :obj:`CameraInstance3DBoxes` has the property. 
+ """ + yaw = self.yaw + loc = self.gravity_center + local_yaw = yaw2local(yaw, loc) + + return local_yaw + @property def gravity_center(self): - """torch.Tensor: A tensor with center of each box.""" + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] @@ -137,82 +151,66 @@ def corners(self): corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) - # rotate around y axis - corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1) + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners @property def bev(self): - """torch.Tensor: A n x 5 tensor of 2D BEV box of each box - with rotation in XYWHR format.""" - return self.tensor[:, [0, 2, 3, 5, 6]] - - @property - def nearest_bev(self): - """torch.Tensor: A tensor of 2D BEV box of each box - without rotation.""" - # Obtain BEV boxes with rotation in XZWHR format - bev_rotated_boxes = self.bev - # convert the rotation to a valid range - rotations = bev_rotated_boxes[:, -1] - normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) - - # find the center of boxes - conditions = (normed_rotations > np.pi / 4)[..., None] - bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, - [0, 1, 3, 2]], - bev_rotated_boxes[:, :4]) - - centers = bboxes_xywh[:, :2] - dims = bboxes_xywh[:, 2:] - bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) - return bev_boxes + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format, in shape (N, 5).""" + bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() + # positive direction of the gravity axis + # in cam coord system points to the earth + # so the bev yaw angle needs to be reversed + bev[:, -1] = -bev[:, -1] + return bev def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or \ - rotation matrix. + """Rotate boxes with points (optional) with the given angle or rotation + matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: - tuple or None: When ``points`` is None, the function returns \ - None, otherwise it returns the rotated points and the \ + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. 
""" if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: - rot_sin = torch.sin(angle) - rot_cos = torch.cos(angle) - rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], - [0, 1, 0], - [rot_sin, 0, rot_cos]]) + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) else: rot_mat_T = angle rot_sin = rot_mat_T[2, 0] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T self.tensor[:, 6] += angle if points is not None: if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.numpy() + rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): - # clockwise - points.rotate(-angle) + points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T @@ -224,7 +222,7 @@ def flip(self, bev_direction='horizontal', points=None): Args: bev_direction (str): Flip direction (horizontal or vertical). - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: @@ -251,28 +249,6 @@ def flip(self, bev_direction='horizontal', points=None): points.flip(bev_direction) return points - def in_range_bev(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): The range of box - (x_min, z_min, x_max, z_max). - - Note: - The original implementation of SECOND checks whether boxes in - a range by checking whether the points are in a convex - polygon, we reduce the burden for simpler cases. - - Returns: - torch.Tensor: Indicating whether each box is inside \ - the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 2] > box_range[1]) - & (self.tensor[:, 0] < box_range[2]) - & (self.tensor[:, 2] < box_range[3])) - return in_range_flags - @classmethod def height_overlaps(cls, boxes1, boxes2, mode='iou'): """Calculate height overlaps of two boxes. @@ -296,8 +272,8 @@ def height_overlaps(cls, boxes1, boxes2, mode='iou'): boxes2_top_height = boxes2.top_height.view(1, -1) boxes2_bottom_height = boxes2.bottom_height.view(1, -1) - # In camera coordinate system - # from up to down is the positive direction + # positive direction of the gravity axis + # in cam coord system points to the earth heighest_of_bottom = torch.min(boxes1_bottom_height, boxes2_bottom_height) lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) @@ -309,16 +285,70 @@ def convert_to(self, dst, rt_mat=None): Args: dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BaseInstance3DBoxes`: \ + :obj:`BaseInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. 
""" from .box_3d_mode import Box3DMode return Box3DMode.convert( box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat) + + def points_in_boxes_part(self, points, boxes_override=None): + """Find the box in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor `. Defaults to None. + + Returns: + torch.Tensor: The index of the box in which + each point is, in shape (M, ). Default value is -1 + (if the point is not enclosed by any box). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, + Coord3DMode.LIDAR) + + box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) + return box_idx + + def points_in_boxes_all(self, points, boxes_override=None): + """Find all boxes in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor `. Defaults to None. + + Returns: + torch.Tensor: The index of all boxes in which each point is, + in shape (B, M, T). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, + Coord3DMode.LIDAR) + + box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) + return box_idx diff --git a/mmdet3d/core/bbox/structures/coord_3d_mode.py b/mmdet3d/core/bbox/structures/coord_3d_mode.py index fd14cfe0a6..6309b65474 100644 --- a/mmdet3d/core/bbox/structures/coord_3d_mode.py +++ b/mmdet3d/core/bbox/structures/coord_3d_mode.py @@ -1,14 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique + import numpy as np import torch -from enum import IntEnum, unique -from mmdet3d.core.points import (BasePoints, CameraPoints, DepthPoints, - LiDARPoints) +from ...points import BasePoints, CameraPoints, DepthPoints, LiDARPoints from .base_box3d import BaseInstance3DBoxes -from .cam_box3d import CameraInstance3DBoxes -from .depth_box3d import DepthInstance3DBoxes -from .lidar_box3d import LiDARInstance3DBoxes +from .box_3d_mode import Box3DMode @unique @@ -64,119 +62,75 @@ class Coord3DMode(IntEnum): DEPTH = 2 @staticmethod - def convert(input, src, dst, rt_mat=None): - """Convert boxes or points from `src` mode to `dst` mode.""" + def convert(input, src, dst, rt_mat=None, with_yaw=True, is_point=True): + """Convert boxes or points from `src` mode to `dst` mode. + + Args: + input (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode` | :obj:`Coord3DMode`): The source mode. + dst (:obj:`Box3DMode` | :obj:`Coord3DMode`): The target mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. 
+ with_yaw (bool): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + is_point (bool): If `input` is neither an instance of + :obj:`BaseInstance3DBoxes` nor an instance of + :obj:`BasePoints`, whether or not it is point data. + Defaults to True. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): + The converted box of the same type. + """ if isinstance(input, BaseInstance3DBoxes): - return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat) + return Coord3DMode.convert_box( + input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) elif isinstance(input, BasePoints): return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, (tuple, list, np.ndarray, torch.Tensor)): + if is_point: + return Coord3DMode.convert_point( + input, src, dst, rt_mat=rt_mat) + else: + return Coord3DMode.convert_box( + input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) else: raise NotImplementedError @staticmethod - def convert_box(box, src, dst, rt_mat=None): + def convert_box(box, src, dst, rt_mat=None, with_yaw=True): """Convert boxes from `src` mode to `dst` mode. Args: box (tuple | list | np.ndarray | - torch.Tensor | BaseInstance3DBoxes): + torch.Tensor | :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. - src (:obj:`CoordMode`): The src Box mode. - dst (:obj:`CoordMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. + with_yaw (bool): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. Returns: - (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes`): The converted box of the same type. """ - if src == dst: - return box - - is_numpy = isinstance(box, np.ndarray) - is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) - single_box = isinstance(box, (list, tuple)) - if single_box: - assert len(box) >= 7, ( - 'CoordMode.convert takes either a k-tuple/list or ' - 'an Nxk array/tensor, where k >= 7') - arr = torch.tensor(box)[None, :] - else: - # avoid modifying the input box - if is_numpy: - arr = torch.from_numpy(np.asarray(box)).clone() - elif is_Instance3DBoxes: - arr = box.tensor.clone() - else: - arr = box.clone() - - # convert box from `src` mode to `dst` mode. 
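A minimal usage sketch (not part of the patch) of the dispatch documented above: `Coord3DMode.convert` routes `BaseInstance3DBoxes`/`BasePoints` instances automatically and treats raw arrays as points unless `is_point=False`, while box conversion is delegated to `Box3DMode.convert`. Import paths follow the modules touched in this diff; the box and point values are made up.

import torch
from mmdet3d.core.bbox.structures.box_3d_mode import Box3DMode
from mmdet3d.core.bbox.structures.coord_3d_mode import Coord3DMode
from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes

# one LiDAR box: (x, y, z, x_size, y_size, z_size, yaw)
lidar_boxes = LiDARInstance3DBoxes(
    torch.tensor([[10.0, 2.0, -1.0, 4.0, 1.8, 1.6, 0.3]]))

# box path: sizes are reordered and yaw is remapped per the branches above
cam_boxes = Box3DMode.convert(lidar_boxes, Box3DMode.LIDAR, Box3DMode.CAM)

# raw tensors default to the point path; extra channels (e.g. intensity) are kept
lidar_points = torch.rand(100, 4)
cam_points = Coord3DMode.convert(lidar_points, Coord3DMode.LIDAR,
                                 Coord3DMode.CAM)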
- x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] - if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) - xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) - elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) - elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) - elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) - else: - raise NotImplementedError( - f'Conversion from Coord3DMode {src} to {dst} ' - 'is not supported yet') - - if not isinstance(rt_mat, torch.Tensor): - rt_mat = arr.new_tensor(rt_mat) - if rt_mat.size(1) == 4: - extended_xyz = torch.cat( - [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) - xyz = extended_xyz @ rt_mat.t() - else: - xyz = arr[:, :3] @ rt_mat.t() - - remains = arr[..., 6:] - arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) - - # convert arr to the original type - original_type = type(box) - if single_box: - return original_type(arr.flatten().tolist()) - if is_numpy: - return arr.numpy() - elif is_Instance3DBoxes: - if dst == Coord3DMode.CAM: - target_type = CameraInstance3DBoxes - elif dst == Coord3DMode.LIDAR: - target_type = LiDARInstance3DBoxes - elif dst == Coord3DMode.DEPTH: - target_type = DepthInstance3DBoxes - else: - raise NotImplementedError( - f'Conversion to {dst} through {original_type}' - ' is not supported yet') - return target_type( - arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) - else: - return arr + return Box3DMode.convert(box, src, dst, rt_mat=rt_mat) @staticmethod def convert_point(point, src, dst, rt_mat=None): @@ -184,18 +138,19 @@ def convert_point(point, src, dst, rt_mat=None): Args: point (tuple | list | np.ndarray | - torch.Tensor | BasePoints): + torch.Tensor | :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk array/tensor. src (:obj:`CoordMode`): The src Point mode. dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - (tuple | list | np.ndarray | torch.Tensor | BasePoints): \ + (tuple | list | np.ndarray | torch.Tensor | :obj:`BasePoints`): The converted point of the same type. """ if src == dst: @@ -219,8 +174,6 @@ def convert_point(point, src, dst, rt_mat=None): arr = point.clone() # convert point from `src` mode to `dst` mode. 
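As a further sketch (not part of the patch), passing an explicit `rt_mat` with four columns exercises the homogeneous branch of `convert_point` below: the points are padded with a trailing 1 before the multiplication, and any extra channels ride along via `remains`. The extrinsic values here are hypothetical; only the rotation part mirrors the default LIDAR-to-CAM matrix used above.

import torch
from mmdet3d.core.bbox.structures.coord_3d_mode import Coord3DMode

# hypothetical 3x4 lidar-to-camera extrinsics ([R | t])
lidar2cam = torch.tensor([[0., -1., 0., 0.02],
                          [0., 0., -1., -0.08],
                          [1., 0., 0., 0.27]])

points = torch.rand(16, 4)  # (x, y, z, intensity) in the LiDAR frame
cam_points = Coord3DMode.convert_point(
    points, Coord3DMode.LIDAR, Coord3DMode.CAM, rt_mat=lidar2cam)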
- # TODO: LIDAR - # only implemented provided Rt matrix in cam-depth conversion if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) @@ -248,13 +201,13 @@ def convert_point(point, src, dst, rt_mat=None): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( - [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: - xyz = arr[:, :3] @ rt_mat.t() + xyz = arr[..., :3] @ rt_mat.t() - remains = arr[:, 3:] - arr = torch.cat([xyz[:, :3], remains], dim=-1) + remains = arr[..., 3:] + arr = torch.cat([xyz[..., :3], remains], dim=-1) # convert arr to the original type original_type = type(point) diff --git a/mmdet3d/core/bbox/structures/depth_box3d.py b/mmdet3d/core/bbox/structures/depth_box3d.py index 50d20736ae..dd9278bfb4 100644 --- a/mmdet3d/core/bbox/structures/depth_box3d.py +++ b/mmdet3d/core/bbox/structures/depth_box3d.py @@ -3,9 +3,8 @@ import torch from mmdet3d.core.points import BasePoints -from mmdet3d.ops import points_in_boxes_batch from .base_box3d import BaseInstance3DBoxes -from .utils import limit_period, rotation_3d_in_axis +from .utils import rotation_3d_in_axis class DepthInstance3DBoxes(BaseInstance3DBoxes): @@ -38,10 +37,11 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes): with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ + YAW_AXIS = 2 @property def gravity_center(self): - """torch.Tensor: A tensor with center of each box.""" + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, :2] = bottom_center[:, :2] @@ -85,73 +85,50 @@ def corners(self): corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate around z axis - corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners - @property - def bev(self): - """torch.Tensor: A n x 5 tensor of 2D BEV box of each box - in XYWHR format.""" - return self.tensor[:, [0, 1, 3, 4, 6]] - - @property - def nearest_bev(self): - """torch.Tensor: A tensor of 2D BEV box of each box - without rotation.""" - # Obtain BEV boxes with rotation in XYWHR format - bev_rotated_boxes = self.bev - # convert the rotation to a valid range - rotations = bev_rotated_boxes[:, -1] - normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) - - # find the center of boxes - conditions = (normed_rotations > np.pi / 4)[..., None] - bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, - [0, 1, 3, 2]], - bev_rotated_boxes[:, :4]) - - centers = bboxes_xywh[:, :2] - dims = bboxes_xywh[:, 2:] - bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) - return bev_boxes - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or \ - rotation matrix. + """Rotate boxes with points (optional) with the given angle or rotation + matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. 
Returns: - tuple or None: When ``points`` is None, the function returns \ - None, otherwise it returns the rotated points and the \ + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. """ if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: - rot_sin = torch.sin(angle) - rot_cos = torch.cos(angle) - rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], - [rot_sin, rot_cos, 0], - [0, 0, 1]]).T + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) else: - rot_mat_T = angle.T + rot_mat_T = angle rot_sin = rot_mat_T[0, 1] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T if self.with_yaw: - self.tensor[:, 6] -= angle + self.tensor[:, 6] += angle else: + # for axis-aligned boxes, we take the new + # enclosing axis-aligned boxes after rotation corners_rot = self.corners @ rot_mat_T new_x_size = corners_rot[..., 0].max( dim=1, keepdim=True)[0] - corners_rot[..., 0].min( @@ -165,11 +142,10 @@ def rotate(self, angle, points=None): if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.numpy() + rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): - # anti-clockwise - points.rotate(angle) + points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T @@ -180,8 +156,9 @@ def flip(self, bev_direction='horizontal', points=None): In Depth coordinates, it flips x (horizontal) or y (vertical) axis. Args: - bev_direction (str): Flip direction (horizontal or vertical). - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + bev_direction (str, optional): Flip direction + (horizontal or vertical). Defaults to 'horizontal'. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: @@ -208,75 +185,26 @@ def flip(self, bev_direction='horizontal', points=None): points.flip(bev_direction) return points - def in_range_bev(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): The range of box - (x_min, y_min, x_max, y_max). - - Note: - In the original implementation of SECOND, checking whether - a box in the range checks whether the points are in a convex - polygon, we try to reduce the burdun for simpler cases. - - Returns: - torch.Tensor: Indicating whether each box is inside \ - the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 1] > box_range[1]) - & (self.tensor[:, 0] < box_range[2]) - & (self.tensor[:, 1] < box_range[3])) - return in_range_flags - def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. 
The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`DepthInstance3DBoxes`: \ + :obj:`DepthInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode return Box3DMode.convert( box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat) - def points_in_boxes(self, points): - """Find points that are in boxes (CUDA). - - Args: - points (torch.Tensor): Points in shape [1, M, 3] or [M, 3], \ - 3 dimensions are [x, y, z] in LiDAR coordinate. - - Returns: - torch.Tensor: The index of boxes each point lies in with shape \ - of (B, M, T). - """ - from .box_3d_mode import Box3DMode - - # to lidar - points_lidar = points.clone() - points_lidar = points_lidar[..., [1, 0, 2]] - points_lidar[..., 1] *= -1 - if points.dim() == 2: - points_lidar = points_lidar.unsqueeze(0) - else: - assert points.dim() == 3 and points_lidar.shape[0] == 1 - - boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor - boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0) - box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar) - - return box_idxs_of_pts.squeeze(0) - def enlarged_box(self, extra_width): """Enlarge the length, width and height boxes. @@ -284,7 +212,7 @@ def enlarged_box(self, extra_width): extra_width (float | torch.Tensor): Extra width to enlarge the box. Returns: - :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + :obj:`DepthInstance3DBoxes`: Enlarged boxes. """ enlarged_boxes = self.tensor.clone() enlarged_boxes[:, 3:6] += extra_width * 2 @@ -331,13 +259,12 @@ def get_surface_line_center(self): -1, 3) surface_rot = rot_mat_T.repeat(6, 1, 1) - surface_3d = torch.matmul( - surface_3d.unsqueeze(-2), surface_rot.transpose(2, 1)).squeeze(-2) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d line_rot = rot_mat_T.repeat(12, 1, 1) - line_3d = torch.matmul( - line_3d.unsqueeze(-2), line_rot.transpose(2, 1)).squeeze(-2) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d return surface_center, line_center diff --git a/mmdet3d/core/bbox/structures/lidar_box3d.py b/mmdet3d/core/bbox/structures/lidar_box3d.py index d007f59ffa..706a6c0d58 100644 --- a/mmdet3d/core/bbox/structures/lidar_box3d.py +++ b/mmdet3d/core/bbox/structures/lidar_box3d.py @@ -3,9 +3,8 @@ import torch from mmdet3d.core.points import BasePoints -from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu from .base_box3d import BaseInstance3DBoxes -from .utils import limit_period, rotation_3d_in_axis +from .utils import rotation_3d_in_axis class LiDARInstance3DBoxes(BaseInstance3DBoxes): @@ -15,16 +14,16 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes): .. code-block:: none - up z x front (yaw=-0.5*pi) - ^ ^ - | / - | / - (yaw=-pi) left y <------ 0 -------- (yaw=0) + up z x front (yaw=0) + ^ ^ + | / + | / + (yaw=0.5*pi) left y <------ 0 The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. - The yaw is 0 at the negative direction of y axis, and decreases from - the negative direction of y to the positive direction of x. + The yaw is 0 at the positive direction of x axis, and increases from + the positive direction of x to the positive direction of y. 
A refactor is ongoing to make the three coordinate systems easier to understand and convert between each other. @@ -36,10 +35,11 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes): with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ + YAW_AXIS = 2 @property def gravity_center(self): - """torch.Tensor: A tensor with center of each box.""" + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, :2] = bottom_center[:, :2] @@ -83,70 +83,45 @@ def corners(self): corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate around z axis - corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners - @property - def bev(self): - """torch.Tensor: 2D BEV box of each box with rotation - in XYWHR format.""" - return self.tensor[:, [0, 1, 3, 4, 6]] - - @property - def nearest_bev(self): - """torch.Tensor: A tensor of 2D BEV box of each box - without rotation.""" - # Obtain BEV boxes with rotation in XYWHR format - bev_rotated_boxes = self.bev - # convert the rotation to a valid range - rotations = bev_rotated_boxes[:, -1] - normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) - - # find the center of boxes - conditions = (normed_rotations > np.pi / 4)[..., None] - bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, - [0, 1, 3, 2]], - bev_rotated_boxes[:, :4]) - - centers = bboxes_xywh[:, :2] - dims = bboxes_xywh[:, 2:] - bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) - return bev_boxes - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or \ - rotation matrix. + """Rotate boxes with points (optional) with the given angle or rotation + matrix. Args: angles (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: - tuple or None: When ``points`` is None, the function returns \ - None, otherwise it returns the rotated points and the \ + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. 
""" if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: - rot_sin = torch.sin(angle) - rot_cos = torch.cos(angle) - rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], - [rot_sin, rot_cos, 0], - [0, 0, 1]]) + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) else: rot_mat_T = angle - rot_sin = rot_mat_T[1, 0] + rot_sin = rot_mat_T[0, 1] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T self.tensor[:, 6] += angle if self.tensor.shape[1] == 9: @@ -157,11 +132,10 @@ def rotate(self, angle, points=None): if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.numpy() + rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): - # clockwise - points.rotate(-angle) + points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T @@ -173,7 +147,7 @@ def flip(self, bev_direction='horizontal', points=None): Args: bev_direction (str): Flip direction (horizontal or vertical). - points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: @@ -183,11 +157,11 @@ def flip(self, bev_direction='horizontal', points=None): if bev_direction == 'horizontal': self.tensor[:, 1::7] = -self.tensor[:, 1::7] if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + self.tensor[:, 6] = -self.tensor[:, 6] elif bev_direction == 'vertical': self.tensor[:, 0::7] = -self.tensor[:, 0::7] if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi if points is not None: assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) @@ -200,40 +174,20 @@ def flip(self, bev_direction='horizontal', points=None): points.flip(bev_direction) return points - def in_range_bev(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): the range of box - (x_min, y_min, x_max, y_max) - - Note: - The original implementation of SECOND checks whether boxes in - a range by checking whether the points are in a convex - polygon, we reduce the burden for simpler cases. - - Returns: - torch.Tensor: Whether each box is inside the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 1] > box_range[1]) - & (self.tensor[:, 0] < box_range[2]) - & (self.tensor[:, 1] < box_range[3])) - return in_range_flags - def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): the target Box mode - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. 
Returns: - :obj:`BaseInstance3DBoxes`: \ + :obj:`BaseInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode @@ -254,17 +208,3 @@ def enlarged_box(self, extra_width): # bottom center z minus extra_width enlarged_boxes[:, 2] -= extra_width return self.new_box(enlarged_boxes) - - def points_in_boxes(self, points): - """Find the box which the points are in. - - Args: - points (torch.Tensor): Points in shape (N, 3). - - Returns: - torch.Tensor: The index of box where each point are in. - """ - box_idx = points_in_boxes_gpu( - points.unsqueeze(0), - self.tensor.unsqueeze(0).to(points.device)).squeeze(0) - return box_idx diff --git a/mmdet3d/core/bbox/structures/utils.py b/mmdet3d/core/bbox/structures/utils.py index 842131f57d..82a4c255dd 100644 --- a/mmdet3d/core/bbox/structures/utils.py +++ b/mmdet3d/core/bbox/structures/utils.py @@ -1,86 +1,141 @@ # Copyright (c) OpenMMLab. All rights reserved. +from logging import warning + import numpy as np import torch -from logging import warning +from mmdet3d.core.utils import array_converter + +@array_converter(apply_to=('val', )) def limit_period(val, offset=0.5, period=np.pi): """Limit the value into a period for periodic function. Args: - val (torch.Tensor): The value to be converted. - offset (float, optional): Offset to set the value range. \ + val (torch.Tensor | np.ndarray): The value to be converted. + offset (float, optional): Offset to set the value range. Defaults to 0.5. period ([type], optional): Period of the value. Defaults to np.pi. Returns: - torch.Tensor: Value in the range of \ + (torch.Tensor | np.ndarray): Value in the range of [-offset * period, (1-offset) * period] """ - return val - torch.floor(val / period + offset) * period + limited_val = val - torch.floor(val / period + offset) * period + return limited_val -def rotation_3d_in_axis(points, angles, axis=0): +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_axis(points, + angles, + axis=0, + return_mat=False, + clockwise=False): """Rotate points by angles according to axis. Args: - points (torch.Tensor): Points of shape (N, M, 3). - angles (torch.Tensor): Vector of angles in shape (N,) + points (np.ndarray | torch.Tensor | list | tuple ): + Points of shape (N, M, 3). + angles (np.ndarray | torch.Tensor | list | tuple | float): + Vector of angles in shape (N,) axis (int, optional): The axis to be rotated. Defaults to 0. + return_mat: Whether or not return the rotation matrix (transposed). + Defaults to False. + clockwise: Whether the rotation is clockwise. Defaults to False. Raises: - ValueError: when the axis is not in range [0, 1, 2], it will \ + ValueError: when the axis is not in range [0, 1, 2], it will raise value error. Returns: - torch.Tensor: Rotated points in shape (N, M, 3) + (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). 
""" + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if isinstance(angles, float) or len(angles.shape) == 0: + angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 1 \ + and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + rot_sin = torch.sin(angles) rot_cos = torch.cos(angles) ones = torch.ones_like(rot_cos) zeros = torch.zeros_like(rot_cos) - if axis == 1: - rot_mat_T = torch.stack([ - torch.stack([rot_cos, zeros, -rot_sin]), - torch.stack([zeros, ones, zeros]), - torch.stack([rot_sin, zeros, rot_cos]) - ]) - elif axis == 2 or axis == -1: - rot_mat_T = torch.stack([ - torch.stack([rot_cos, -rot_sin, zeros]), - torch.stack([rot_sin, rot_cos, zeros]), - torch.stack([zeros, zeros, ones]) - ]) - elif axis == 0: + + if points.shape[-1] == 3: + if axis == 1 or axis == -2: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin, zeros]), + torch.stack([-rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0 or axis == -3: + rot_mat_T = torch.stack([ + torch.stack([ones, zeros, zeros]), + torch.stack([zeros, rot_cos, rot_sin]), + torch.stack([zeros, -rot_sin, rot_cos]) + ]) + else: + raise ValueError(f'axis should in range ' + f'[-3, -2, -1, 0, 1, 2], got {axis}') + else: rot_mat_T = torch.stack([ - torch.stack([zeros, rot_cos, -rot_sin]), - torch.stack([zeros, rot_sin, rot_cos]), - torch.stack([ones, zeros, zeros]) + torch.stack([rot_cos, rot_sin]), + torch.stack([-rot_sin, rot_cos]) ]) + + if clockwise: + rot_mat_T = rot_mat_T.transpose(0, 1) + + if points.shape[0] == 0: + points_new = points else: - raise ValueError(f'axis should in range [0, 1, 2], got {axis}') + points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) - return torch.einsum('aij,jka->aik', (points, rot_mat_T)) + if return_mat: + rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new +@array_converter(apply_to=('boxes_xywhr', )) def xywhr2xyxyr(boxes_xywhr): """Convert a rotated boxes in XYWHR format to XYXYR format. Args: - boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format. + boxes_xywhr (torch.Tensor | np.ndarray): Rotated boxes in XYWHR format. Returns: - torch.Tensor: Converted boxes in XYXYR format. + (torch.Tensor | np.ndarray): Converted boxes in XYXYR format. """ boxes = torch.zeros_like(boxes_xywhr) - half_w = boxes_xywhr[:, 2] / 2 - half_h = boxes_xywhr[:, 3] / 2 - - boxes[:, 0] = boxes_xywhr[:, 0] - half_w - boxes[:, 1] = boxes_xywhr[:, 1] - half_h - boxes[:, 2] = boxes_xywhr[:, 0] + half_w - boxes[:, 3] = boxes_xywhr[:, 1] + half_h - boxes[:, 4] = boxes_xywhr[:, 4] + half_w = boxes_xywhr[..., 2] / 2 + half_h = boxes_xywhr[..., 3] / 2 + + boxes[..., 0] = boxes_xywhr[..., 0] - half_w + boxes[..., 1] = boxes_xywhr[..., 1] - half_h + boxes[..., 2] = boxes_xywhr[..., 0] + half_w + boxes[..., 3] = boxes_xywhr[..., 1] + half_h + boxes[..., 4] = boxes_xywhr[..., 4] return boxes @@ -91,6 +146,10 @@ def get_box_type(box_type): box_type (str): The type of box structure. 
The valid value are "LiDAR", "Camera", or "Depth". + Raises: + ValueError: A ValueError is raised when `box_type` + does not belong to the three valid types. + Returns: tuple: Box type and box mode. """ @@ -113,21 +172,24 @@ def get_box_type(box_type): return box_type_3d, box_mode_3d +@array_converter(apply_to=('points_3d', 'proj_mat')) def points_cam2img(points_3d, proj_mat, with_depth=False): - """Project points from camera coordicates to image coordinates. + """Project points in camera coordinates to image coordinates. Args: - points_3d (torch.Tensor): Points in shape (N, 3). - proj_mat (torch.Tensor): Transformation matrix between coordinates. + points_3d (torch.Tensor | np.ndarray): Points in shape (N, 3) + proj_mat (torch.Tensor | np.ndarray): + Transformation matrix between coordinates. with_depth (bool, optional): Whether to keep depth in the output. Defaults to False. Returns: - torch.Tensor: Points in image coordinates with shape [N, 2]. + (torch.Tensor | np.ndarray): Points in image coordinates, + with shape [N, 2] if `with_depth=False`, else [N, 3]. """ - points_num = list(points_3d.shape)[:-1] + points_shape = list(points_3d.shape) + points_shape[-1] = 1 - points_shape = np.concatenate([points_num, [1]], axis=0).tolist() assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ f' matrix should be 2 instead of {len(proj_mat.shape)}.' d1, d2 = proj_mat.shape[:2] @@ -140,17 +202,52 @@ def points_cam2img(points_3d, proj_mat, with_depth=False): proj_mat_expanded[:d1, :d2] = proj_mat proj_mat = proj_mat_expanded - # previous implementation use new_zeros, new_one yeilds better results - points_4 = torch.cat( - [points_3d, points_3d.new_ones(*points_shape)], dim=-1) - point_2d = torch.matmul(points_4, proj_mat.t()) + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + + point_2d = points_4 @ proj_mat.T point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] if with_depth: - return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + return point_2d_res +@array_converter(apply_to=('points', 'cam2img')) +def points_img2cam(points, cam2img): + """Project points in image coordinates to camera coordinates. + + Args: + points (torch.Tensor): 2.5D points in 2D images, [N, 3], + 3 corresponds with x, y in the image and depth. + cam2img (torch.Tensor): Camera intrinsic matrix. The shape can be + [3, 3], [3, 4] or [4, 4]. + + Returns: + torch.Tensor: points in 3D space. [N, 3], + 3 corresponds with x, y, z in 3D space. + """ + assert cam2img.shape[0] <= 4 + assert cam2img.shape[1] <= 4 + assert points.shape[1] == 3 + + xys = points[:, :2] + depths = points[:, 2].view(-1, 1) + unnormed_xys = torch.cat([xys * depths, depths], dim=1) + + pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) + pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img + inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) + + # Do operation in homogeneous coordinates. + num_points = unnormed_xys.shape[0] + homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) + points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] + + return points3D + + def mono_cam_box2vis(cam_box): """This is a post-processing function on the bboxes from Mono-3D task. 
If we want to perform projection visualization, we need to: @@ -162,9 +259,9 @@ def mono_cam_box2vis(cam_box): After applying this function, we can project and draw it on 2D images. Args: - cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate \ - system before conversion. Could be gt bbox loaded from dataset or \ - network prediction output. + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate + system before conversion. Could be gt bbox loaded from dataset + or network prediction output. Returns: :obj:`CameraInstance3DBoxes`: Box after conversion. @@ -212,3 +309,27 @@ def get_proj_mat_by_coord_type(img_meta, coord_type): mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} assert coord_type in mapping.keys() return img_meta[mapping[coord_type]] + + +def yaw2local(yaw, loc): + """Transform global yaw to local yaw (alpha in kitti) in camera + coordinates, ranges from -pi to pi. + + Args: + yaw (torch.Tensor): A vector with local yaw of each box. + shape: (N, ) + loc (torch.Tensor): gravity center of each box. + shape: (N, 3) + + Returns: + torch.Tensor: local yaw (alpha in kitti). + """ + local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) + larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaw[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaw[small_idx] += 2 * np.pi + + return local_yaw diff --git a/mmdet3d/core/bbox/transforms.py b/mmdet3d/core/bbox/transforms.py index 34d838a689..8a2eb90f54 100644 --- a/mmdet3d/core/bbox/transforms.py +++ b/mmdet3d/core/bbox/transforms.py @@ -32,7 +32,7 @@ def bbox3d2roi(bbox_list): corresponding to a batch of images. Returns: - torch.Tensor: Region of interests in shape (n, c), where \ + torch.Tensor: Region of interests in shape (n, c), where the channels are in order of [batch_ind, x, y ...]. """ rois_list = [] @@ -51,10 +51,10 @@ def bbox3d2result(bboxes, scores, labels, attrs=None): """Convert detection results to a list of numpy arrays. Args: - bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). - labels (torch.Tensor): Labels with shape of (n, ). - scores (torch.Tensor): Scores with shape of (n, ). - attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + bboxes (torch.Tensor): Bounding boxes with shape (N, 5). + labels (torch.Tensor): Labels with shape (N, ). + scores (torch.Tensor): Scores with shape (N, ). + attrs (torch.Tensor, optional): Attributes with shape (N, ). Defaults to None. Returns: diff --git a/mmdet3d/core/evaluation/indoor_eval.py b/mmdet3d/core/evaluation/indoor_eval.py index 50d758655a..2ff9877329 100644 --- a/mmdet3d/core/evaluation/indoor_eval.py +++ b/mmdet3d/core/evaluation/indoor_eval.py @@ -9,9 +9,9 @@ def average_precision(recalls, precisions, mode='area'): """Calculate average precision (for single or multiple scales). Args: - recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) \ + recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) or (num_dets, ). - precisions (np.ndarray): Precisions with shape of \ + precisions (np.ndarray): Precisions with shape of (num_scales, num_dets) or (num_dets, ). mode (str): 'area' or '11points', 'area' means calculating the area under precision-recall curve, '11points' means calculating @@ -58,13 +58,13 @@ def eval_det_cls(pred, gt, iou_thr=None): single class. 
Args: - pred (dict): Predictions mapping from image id to bounding boxes \ + pred (dict): Predictions mapping from image id to bounding boxes and scores. gt (dict): Ground truths mapping from image id to bounding boxes. iou_thr (list[float]): A list of iou thresholds. Return: - tuple (np.ndarray, np.ndarray, float): Recalls, precisions and \ + tuple (np.ndarray, np.ndarray, float): Recalls, precisions and average precision. """ @@ -170,10 +170,9 @@ def eval_map_recall(pred, gt, ovthresh=None): Args: pred (dict): Information of detection results, which maps class_id and predictions. - gt (dict): Information of ground truths, which maps class_id and \ + gt (dict): Information of ground truths, which maps class_id and ground truths. - ovthresh (list[float]): iou threshold. - Default: None. + ovthresh (list[float], optional): iou threshold. Default: None. Return: tuple[dict]: dict results of recall, AP, and precision for all classes. @@ -218,12 +217,12 @@ def indoor_eval(gt_annos, includes the following keys - labels_3d (torch.Tensor): Labels of boxes. - - boxes_3d (:obj:`BaseInstance3DBoxes`): \ + - boxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes in Depth coordinate. - scores_3d (torch.Tensor): Scores of boxes. metric (list[float]): IoU thresholds for computing average precisions. label2cat (dict): Map from label to category. - logger (logging.Logger | str | None): The way to print the mAP + logger (logging.Logger | str, optional): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Return: diff --git a/mmdet3d/core/evaluation/kitti_utils/eval.py b/mmdet3d/core/evaluation/kitti_utils/eval.py index 93492c466c..f8408dfa61 100644 --- a/mmdet3d/core/evaluation/kitti_utils/eval.py +++ b/mmdet3d/core/evaluation/kitti_utils/eval.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
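For context (not part of the patch): the AP11 and AP40 helpers introduced in the hunk below differ only in which of the 41 recall samples they average over — AP11 keeps every fourth sample starting at recall 0 (11 points), while AP40 keeps every sample except the first (40 points). A toy comparison, assuming a monotonically decreasing precision curve sampled at 41 recall positions:

import numpy as np

def get_mAP11(prec):
    # 11-point interpolation: indices 0, 4, ..., 40 of the 41-sample curve
    sums = 0
    for i in range(0, prec.shape[-1], 4):
        sums = sums + prec[..., i]
    return sums / 11 * 100

def get_mAP40(prec):
    # 40-point interpolation: indices 1..40, skipping recall 0
    sums = 0
    for i in range(1, prec.shape[-1]):
        sums = sums + prec[..., i]
    return sums / 40 * 100

prec = np.linspace(1.0, 0.5, 41)  # toy precision values at 41 recall positions
print(get_mAP11(prec), get_mAP40(prec))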
import gc import io as sysio + import numba import numpy as np @@ -569,13 +570,20 @@ def eval_class(gt_annos, return ret_dict -def get_mAP(prec): +def get_mAP11(prec): sums = 0 for i in range(0, prec.shape[-1], 4): sums = sums + prec[..., i] return sums / 11 * 100 +def get_mAP40(prec): + sums = 0 + for i in range(1, prec.shape[-1]): + sums = sums + prec[..., i] + return sums / 40 * 100 + + def print_str(value, *arg, sstream=None): if sstream is None: sstream = sysio.StringIO() @@ -592,8 +600,10 @@ def do_eval(gt_annos, eval_types=['bbox', 'bev', '3d']): # min_overlaps: [num_minoverlap, metric, num_class] difficultys = [0, 1, 2] - mAP_bbox = None - mAP_aos = None + mAP11_bbox = None + mAP11_aos = None + mAP40_bbox = None + mAP40_aos = None if 'bbox' in eval_types: ret = eval_class( gt_annos, @@ -604,22 +614,29 @@ def do_eval(gt_annos, min_overlaps, compute_aos=('aos' in eval_types)) # ret: [num_class, num_diff, num_minoverlap, num_sample_points] - mAP_bbox = get_mAP(ret['precision']) + mAP11_bbox = get_mAP11(ret['precision']) + mAP40_bbox = get_mAP40(ret['precision']) if 'aos' in eval_types: - mAP_aos = get_mAP(ret['orientation']) + mAP11_aos = get_mAP11(ret['orientation']) + mAP40_aos = get_mAP40(ret['orientation']) - mAP_bev = None + mAP11_bev = None + mAP40_bev = None if 'bev' in eval_types: ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, min_overlaps) - mAP_bev = get_mAP(ret['precision']) + mAP11_bev = get_mAP11(ret['precision']) + mAP40_bev = get_mAP40(ret['precision']) - mAP_3d = None + mAP11_3d = None + mAP40_3d = None if '3d' in eval_types: ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) - mAP_3d = get_mAP(ret['precision']) - return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + mAP11_3d = get_mAP11(ret['precision']) + mAP40_3d = get_mAP40(ret['precision']) + return (mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, + mAP40_3d, mAP40_aos) def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, @@ -629,9 +646,10 @@ def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, for i in range(overlap_ranges.shape[1]): for j in range(overlap_ranges.shape[2]): min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) - mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos, - current_classes, min_overlaps, - compute_aos) + mAP_bbox, mAP_bev, mAP_3d, mAP_aos, _, _, \ + _, _ = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + compute_aos) # ret: [num_class, num_diff, num_minoverlap] mAP_bbox = mAP_bbox.mean(-1) mAP_bev = mAP_bev.mean(-1) @@ -703,33 +721,109 @@ def kitti_eval(gt_annos, if compute_aos: eval_types.append('aos') - mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos, - current_classes, min_overlaps, - eval_types) + mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, \ + mAP40_3d, mAP40_aos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + eval_types) ret_dict = {} difficulty = ['easy', 'moderate', 'hard'] + + # calculate AP11 + result += '\n----------- AP11 Results ------------\n\n' for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] curcls_name = class_to_name[curcls] for i in range(min_overlaps.shape[0]): # prepare results for print - result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format( + result += ('{} AP11@{:.2f}, {:.2f}, {:.2f}:\n'.format( curcls_name, *min_overlaps[i, :, j])) - if mAPbbox is not None: - result += 
'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAPbbox[j, :, i]) - if mAPbev is not None: - result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAPbev[j, :, i]) - if mAP3d is not None: - result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP3d[j, :, i]) + if mAP11_bbox is not None: + result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bbox[j, :, i]) + if mAP11_bev is not None: + result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bev[j, :, i]) + if mAP11_3d is not None: + result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_3d[j, :, i]) + if compute_aos: + result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP11_aos[j, :, i]) + # prepare results for logger + for idx in range(3): + if i == 0: + postfix = f'{difficulty[idx]}_strict' + else: + postfix = f'{difficulty[idx]}_loose' + prefix = f'KITTI/{curcls_name}' + if mAP11_3d is not None: + ret_dict[f'{prefix}_3D_AP11_{postfix}'] =\ + mAP11_3d[j, idx, i] + if mAP11_bev is not None: + ret_dict[f'{prefix}_BEV_AP11_{postfix}'] =\ + mAP11_bev[j, idx, i] + if mAP11_bbox is not None: + ret_dict[f'{prefix}_2D_AP11_{postfix}'] =\ + mAP11_bbox[j, idx, i] + + # calculate mAP11 over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP11@{}, {}, {}:\n'.format(*difficulty)) + if mAP11_bbox is not None: + mAP11_bbox = mAP11_bbox.mean(axis=0) + result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bbox[:, 0]) + if mAP11_bev is not None: + mAP11_bev = mAP11_bev.mean(axis=0) + result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bev[:, 0]) + if mAP11_3d is not None: + mAP11_3d = mAP11_3d.mean(axis=0) + result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP11_3d[:, + 0]) + if compute_aos: + mAP11_aos = mAP11_aos.mean(axis=0) + result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP11_aos[:, 0]) + + # prepare results for logger + for idx in range(3): + postfix = f'{difficulty[idx]}' + if mAP11_3d is not None: + ret_dict[f'KITTI/Overall_3D_AP11_{postfix}'] = mAP11_3d[idx, 0] + if mAP11_bev is not None: + ret_dict[f'KITTI/Overall_BEV_AP11_{postfix}'] =\ + mAP11_bev[idx, 0] + if mAP11_bbox is not None: + ret_dict[f'KITTI/Overall_2D_AP11_{postfix}'] =\ + mAP11_bbox[idx, 0] + + # Calculate AP40 + result += '\n----------- AP40 Results ------------\n\n' + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP40@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAP40_bbox is not None: + result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bbox[j, :, i]) + if mAP40_bev is not None: + result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bev[j, :, i]) + if mAP40_3d is not None: + result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_3d[j, :, i]) if compute_aos: - result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format( - *mAPaos[j, :, i]) + result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP40_aos[j, :, i]) # prepare results for logger for idx in range(3): @@ -738,39 +832,48 @@ def kitti_eval(gt_annos, else: postfix = f'{difficulty[idx]}_loose' prefix = f'KITTI/{curcls_name}' - if mAP3d is not None: - ret_dict[f'{prefix}_3D_{postfix}'] = mAP3d[j, idx, i] - if mAPbev is not None: - ret_dict[f'{prefix}_BEV_{postfix}'] = mAPbev[j, idx, i] - 
if mAPbbox is not None: - ret_dict[f'{prefix}_2D_{postfix}'] = mAPbbox[j, idx, i] - - # calculate mAP over all classes if there are multiple classes + if mAP40_3d is not None: + ret_dict[f'{prefix}_3D_AP40_{postfix}'] =\ + mAP40_3d[j, idx, i] + if mAP40_bev is not None: + ret_dict[f'{prefix}_BEV_AP40_{postfix}'] =\ + mAP40_bev[j, idx, i] + if mAP40_bbox is not None: + ret_dict[f'{prefix}_2D_AP40_{postfix}'] =\ + mAP40_bbox[j, idx, i] + + # calculate mAP40 over all classes if there are multiple classes if len(current_classes) > 1: # prepare results for print - result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty)) - if mAPbbox is not None: - mAPbbox = mAPbbox.mean(axis=0) - result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, 0]) - if mAPbev is not None: - mAPbev = mAPbev.mean(axis=0) - result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0]) - if mAP3d is not None: - mAP3d = mAP3d.mean(axis=0) - result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0]) + result += ('\nOverall AP40@{}, {}, {}:\n'.format(*difficulty)) + if mAP40_bbox is not None: + mAP40_bbox = mAP40_bbox.mean(axis=0) + result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bbox[:, 0]) + if mAP40_bev is not None: + mAP40_bev = mAP40_bev.mean(axis=0) + result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bev[:, 0]) + if mAP40_3d is not None: + mAP40_3d = mAP40_3d.mean(axis=0) + result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP40_3d[:, + 0]) if compute_aos: - mAPaos = mAPaos.mean(axis=0) - result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0]) + mAP40_aos = mAP40_aos.mean(axis=0) + result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP40_aos[:, 0]) # prepare results for logger for idx in range(3): postfix = f'{difficulty[idx]}' - if mAP3d is not None: - ret_dict[f'KITTI/Overall_3D_{postfix}'] = mAP3d[idx, 0] - if mAPbev is not None: - ret_dict[f'KITTI/Overall_BEV_{postfix}'] = mAPbev[idx, 0] - if mAPbbox is not None: - ret_dict[f'KITTI/Overall_2D_{postfix}'] = mAPbbox[idx, 0] + if mAP40_3d is not None: + ret_dict[f'KITTI/Overall_3D_AP40_{postfix}'] = mAP40_3d[idx, 0] + if mAP40_bev is not None: + ret_dict[f'KITTI/Overall_BEV_AP40_{postfix}'] =\ + mAP40_bev[idx, 0] + if mAP40_bbox is not None: + ret_dict[f'KITTI/Overall_2D_AP40_{postfix}'] =\ + mAP40_bbox[idx, 0] return result, ret_dict diff --git a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py index 2f0c9c8e50..9ed75bf08d 100644 --- a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py +++ b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py @@ -5,6 +5,7 @@ # Author: yanyan, scrin@foxmail.com ##################### import math + import numba import numpy as np from numba import cuda @@ -15,13 +16,13 @@ def div_up(m, n): return m // n + (m % n > 0) -@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +@cuda.jit(device=True, inline=True) def trangle_area(a, b, c): return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 -@cuda.jit('(float32[:], int32)', device=True, inline=True) +@cuda.jit(device=True, inline=True) def area(int_pts, num_of_inter): area_val = 0.0 for i in range(num_of_inter - 2): @@ -31,7 +32,7 @@ def area(int_pts, num_of_inter): return area_val -@cuda.jit('(float32[:], int32)', device=True, inline=True) +@cuda.jit(device=True, inline=True) def sort_vertex_in_convex_polygon(int_pts, num_of_inter): if num_of_inter > 0: center = cuda.local.array((2, ), dtype=numba.float32) @@ -71,10 +72,7 @@ def 
sort_vertex_in_convex_polygon(int_pts, num_of_inter): int_pts[j * 2 + 1] = ty -@cuda.jit( - '(float32[:], float32[:], int32, int32, float32[:])', - device=True, - inline=True) +@cuda.jit(device=True, inline=True) def line_segment_intersection(pts1, pts2, i, j, temp_pts): A = cuda.local.array((2, ), dtype=numba.float32) B = cuda.local.array((2, ), dtype=numba.float32) @@ -117,10 +115,7 @@ def line_segment_intersection(pts1, pts2, i, j, temp_pts): return False -@cuda.jit( - '(float32[:], float32[:], int32, int32, float32[:])', - device=True, - inline=True) +@cuda.jit(device=True, inline=True) def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): a = cuda.local.array((2, ), dtype=numba.float32) b = cuda.local.array((2, ), dtype=numba.float32) @@ -159,7 +154,7 @@ def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): return True -@cuda.jit('(float32, float32, float32[:])', device=True, inline=True) +@cuda.jit(device=True, inline=True) def point_in_quadrilateral(pt_x, pt_y, corners): ab0 = corners[2] - corners[0] ab1 = corners[3] - corners[1] @@ -178,7 +173,7 @@ def point_in_quadrilateral(pt_x, pt_y, corners): return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 -@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +@cuda.jit(device=True, inline=True) def quadrilateral_intersection(pts1, pts2, int_pts): num_of_inter = 0 for i in range(4): @@ -202,7 +197,7 @@ def quadrilateral_intersection(pts1, pts2, int_pts): return num_of_inter -@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +@cuda.jit(device=True, inline=True) def rbbox_to_corners(corners, rbbox): # generate clockwise corners and rotate it clockwise angle = rbbox[4] @@ -228,7 +223,7 @@ def rbbox_to_corners(corners, rbbox): 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y -@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +@cuda.jit(device=True, inline=True) def inter(rbbox1, rbbox2): """Compute intersection of two rotated boxes. @@ -254,7 +249,7 @@ def inter(rbbox1, rbbox2): return area(intersection_corners, num_intersection) -@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True) +@cuda.jit(device=True, inline=True) def devRotateIoUEval(rbox1, rbox2, criterion=-1): """Compute rotated iou on device. @@ -291,7 +286,8 @@ def rotate_iou_kernel_eval(N, dev_query_boxes, dev_iou, criterion=-1): - """Kernel of computing rotated iou. + """Kernel of computing rotated IoU. This function is for bev boxes in + camera coordinate system ONLY (the rotation is clockwise). Args: N (int): The number of boxes. @@ -343,10 +339,14 @@ def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): in one example with numba.cuda code). convert from [this project]( https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). + This function is for bev boxes in camera coordinate system ONLY + (the rotation is clockwise). + Args: boxes (torch.Tensor): rbboxes. format: centers, dims, angles(clockwise when positive) with the shape of [N, 5]. - query_boxes (float tensor: [K, 5]): rbboxes to compute iou with boxes. + query_boxes (torch.FloatTensor, shape=(K, 5)): + rbboxes to compute iou with boxes. device_id (int, optional): Defaults to 0. Device to use. criterion (int, optional): Indicate different type of iou. 
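# Hedged usage sketch for rotate_iou_gpu_eval (not from the repo). Dropping the
# explicit '(float32[:], ...)' signature strings above lets numba specialize the
# device functions lazily for the dtype actually passed in, instead of binding
# them to float32 at import time. Per the docstring, the boxes are BEV boxes in
# the camera coordinate system with clockwise-positive rotation:
import numpy as np

from mmdet3d.core.evaluation.kitti_utils.rotate_iou import rotate_iou_gpu_eval

# each row: 2D center, 2D dims, rotation angle (clockwise positive)
boxes = np.array([[0.0, 0.0, 4.0, 2.0, 0.0]], dtype=np.float32)
query_boxes = np.array([[1.0, 0.0, 4.0, 2.0, 0.3]], dtype=np.float32)
# requires a CUDA-capable GPU and numba; returns an (N, K) pairwise IoU matrix
ious = rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0)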
-1 indicate `area_inter / (area1 + area2 - area_inter)`, diff --git a/mmdet3d/core/evaluation/lyft_eval.py b/mmdet3d/core/evaluation/lyft_eval.py index b6aa775ef9..47c5cd6a60 100644 --- a/mmdet3d/core/evaluation/lyft_eval.py +++ b/mmdet3d/core/evaluation/lyft_eval.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + import mmcv import numpy as np from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap, @@ -7,7 +9,6 @@ group_by_key, wrap_in_box) from mmcv.utils import print_log -from os import path as osp from terminaltables import AsciiTable @@ -18,7 +19,7 @@ def load_lyft_gts(lyft, data_root, eval_split, logger=None): lyft (:obj:`LyftDataset`): Lyft class in the sdk. data_root (str): Root of data for reading splits. eval_split (str): Name of the split for evaluation. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. Returns: @@ -96,7 +97,7 @@ def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None): res_path (str): Path of result json file recording detections. eval_set (str): Name of the split for evaluation. output_dir (str): Output directory for output json files. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. Returns: @@ -202,9 +203,9 @@ def get_single_class_aps(gt, predictions, iou_thresholds): Args: gt (list[dict]): list of dictionaries in the format described above. - predictions (list[dict]): list of dictionaries in the format \ + predictions (list[dict]): list of dictionaries in the format described below. - iou_thresholds (list[float]): IOU thresholds used to calculate \ + iou_thresholds (list[float]): IOU thresholds used to calculate TP / FN Returns: diff --git a/mmdet3d/core/evaluation/seg_eval.py b/mmdet3d/core/evaluation/seg_eval.py index ab26de0241..4a3166d685 100644 --- a/mmdet3d/core/evaluation/seg_eval.py +++ b/mmdet3d/core/evaluation/seg_eval.py @@ -77,7 +77,7 @@ def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None): seg_preds (list[torch.Tensor]): Predictions. label2cat (dict): Map from label to category name. ignore_index (int): Index that will be ignored in evaluation. - logger (logging.Logger | str | None): The way to print the mAP + logger (logging.Logger | str, optional): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Returns: diff --git a/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py b/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py index 72a3883a8a..205c24cbc0 100644 --- a/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py +++ b/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py @@ -10,11 +10,12 @@ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') +from glob import glob +from os.path import join + import mmcv import numpy as np import tensorflow as tf -from glob import glob -from os.path import join from waymo_open_dataset import label_pb2 from waymo_open_dataset.protos import metrics_pb2 @@ -114,7 +115,7 @@ def parse_one_object(instance_idx): instance_idx (int): Index of the instance to be converted. Returns: - :obj:`Object`: Predicted instance in waymo dataset \ + :obj:`Object`: Predicted instance in waymo dataset Object proto. 
""" cls = kitti_result['name'][instance_idx] diff --git a/mmdet3d/core/points/base_points.py b/mmdet3d/core/points/base_points.py index b40e68431f..1f03ce3be8 100644 --- a/mmdet3d/core/points/base_points.py +++ b/mmdet3d/core/points/base_points.py @@ -1,26 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch import warnings from abc import abstractmethod +import numpy as np +import torch + +from ..bbox.structures.utils import rotation_3d_in_axis + class BasePoints(object): """Base class for Points. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int): Number of the dimension of a point. - Each row is (x, y, z). Default to 3. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. Default to None. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Default to None. + dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ @@ -45,7 +48,7 @@ def __init__(self, tensor, points_dim=3, attribute_dims=None): @property def coord(self): - """torch.Tensor: Coordinates of each point with size (N, 3).""" + """torch.Tensor: Coordinates of each point in shape (N, 3).""" return self.tensor[:, :3] @coord.setter @@ -61,7 +64,8 @@ def coord(self, tensor): @property def height(self): - """torch.Tensor: A vector with height of each point.""" + """torch.Tensor: + A vector with height of each point in shape (N, 1), or None.""" if self.attribute_dims is not None and \ 'height' in self.attribute_dims.keys(): return self.tensor[:, self.attribute_dims['height']] @@ -91,7 +95,8 @@ def height(self, tensor): @property def color(self): - """torch.Tensor: A vector with color of each point.""" + """torch.Tensor: + A vector with color of each point in shape (N, 3), or None.""" if self.attribute_dims is not None and \ 'color' in self.attribute_dims.keys(): return self.tensor[:, self.attribute_dims['color']] @@ -141,9 +146,9 @@ def rotate(self, rotation, axis=None): """Rotate points with the given rotation matrix or angle. Args: - rotation (float, np.ndarray, torch.Tensor): Rotation matrix + rotation (float | np.ndarray | torch.Tensor): Rotation matrix or angle. - axis (int): Axis to rotate at. Defaults to None. + axis (int, optional): Axis to rotate at. Defaults to None. 
""" if not isinstance(rotation, torch.Tensor): rotation = self.tensor.new_tensor(rotation) @@ -154,34 +159,24 @@ def rotate(self, rotation, axis=None): axis = self.rotation_axis if rotation.numel() == 1: - rot_sin = torch.sin(rotation) - rot_cos = torch.cos(rotation) - if axis == 1: - rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin], - [0, 1, 0], - [rot_sin, 0, rot_cos]]) - elif axis == 2 or axis == -1: - rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0], - [rot_sin, rot_cos, 0], - [0, 0, 1]]) - elif axis == 0: - rot_mat_T = rotation.new_tensor([[1, 0, 0], - [0, rot_cos, -rot_sin], - [0, rot_sin, rot_cos]]) - else: - raise ValueError('axis should in range') - rot_mat_T = rot_mat_T.T - elif rotation.numel() == 9: - rot_mat_T = rotation + rotated_points, rot_mat_T = rotation_3d_in_axis( + self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) else: - raise NotImplementedError - self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + # rotation.numel() == 9 + self.tensor[:, :3] = self.tensor[:, :3] @ rotation + rot_mat_T = rotation return rot_mat_T @abstractmethod def flip(self, bev_direction='horizontal'): - """Flip the points in BEV along given BEV direction.""" + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ pass def translate(self, trans_vector): @@ -218,7 +213,7 @@ def in_range_3d(self, point_range): polygon, we try to reduce the burden for simpler cases. Returns: - torch.Tensor: A binary vector indicating whether each point is \ + torch.Tensor: A binary vector indicating whether each point is inside the reference range. """ in_range_flags = ((self.tensor[:, 0] > point_range[0]) @@ -229,7 +224,11 @@ def in_range_3d(self, point_range): & (self.tensor[:, 2] < point_range[5])) return in_range_flags - @abstractmethod + @property + def bev(self): + """torch.Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 1]] + def in_range_bev(self, point_range): """Check whether the points are in the given range. @@ -238,10 +237,14 @@ def in_range_bev(self, point_range): in order of (x_min, y_min, x_max, y_max). Returns: - torch.Tensor: Indicating whether each point is inside \ + torch.Tensor: Indicating whether each point is inside the reference range. """ - pass + in_range_flags = ((self.bev[:, 0] > point_range[0]) + & (self.bev[:, 1] > point_range[1]) + & (self.bev[:, 1] < point_range[2]) + & (self.bev[:, 1] < point_range[3])) + return in_range_flags @abstractmethod def convert_to(self, dst, rt_mat=None): @@ -249,14 +252,15 @@ def convert_to(self, dst, rt_mat=None): Args: dst (:obj:`CoordMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BasePoints`: The converted box of the same type \ + :obj:`BasePoints`: The converted box of the same type in the `dst` mode. """ pass @@ -288,7 +292,7 @@ def __getitem__(self, item): subject to Pytorch's indexing semantics. Returns: - :obj:`BasePoints`: A new object of \ + :obj:`BasePoints`: A new object of :class:`BasePoints` after indexing. 
""" original_type = type(self) @@ -379,7 +383,7 @@ def to(self, device): device (str | :obj:`torch.device`): The name of the device. Returns: - :obj:`BasePoints`: A new boxes object on the \ + :obj:`BasePoints`: A new boxes object on the specific device. """ original_type = type(self) @@ -392,7 +396,7 @@ def clone(self): """Clone the Points. Returns: - :obj:`BasePoints`: Box object with the same properties \ + :obj:`BasePoints`: Box object with the same properties as self. """ original_type = type(self) @@ -417,14 +421,14 @@ def __iter__(self): def new_point(self, data): """Create a new point object with data. - The new point and its tensor has the similar properties \ + The new point and its tensor has the similar properties as self and self.tensor, respectively. Args: data (torch.Tensor | numpy.array | list): Data to be copied. Returns: - :obj:`BasePoints`: A new point object with ``data``, \ + :obj:`BasePoints`: A new point object with ``data``, the object's other properties are similar to ``self``. """ new_tensor = self.tensor.new_tensor(data) \ diff --git a/mmdet3d/core/points/cam_points.py b/mmdet3d/core/points/cam_points.py index cb866ddcfb..a57c3db1e8 100644 --- a/mmdet3d/core/points/cam_points.py +++ b/mmdet3d/core/points/cam_points.py @@ -7,17 +7,17 @@ class CameraPoints(BasePoints): Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int): Number of the dimension of a point. - Each row is (x, y, z). Default to 3. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. Default to None. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Default to None. + dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ @@ -27,42 +27,35 @@ def __init__(self, tensor, points_dim=3, attribute_dims=None): self.rotation_axis = 1 def flip(self, bev_direction='horizontal'): - """Flip the boxes in BEV along given BEV direction.""" + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 2] = -self.tensor[:, 2] - def in_range_bev(self, point_range): - """Check whether the points are in the given range. - - Args: - point_range (list | torch.Tensor): The range of point - in order of (x_min, y_min, x_max, y_max). - - Returns: - torch.Tensor: Indicating whether each point is inside \ - the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > point_range[0]) - & (self.tensor[:, 2] > point_range[1]) - & (self.tensor[:, 0] < point_range[2]) - & (self.tensor[:, 2] < point_range[3])) - return in_range_flags + @property + def bev(self): + """torch.Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 2]] def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. 
+ rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BasePoints`: The converted point of the same type \ + :obj:`BasePoints`: The converted point of the same type in the `dst` mode. """ from mmdet3d.core.bbox import Coord3DMode diff --git a/mmdet3d/core/points/depth_points.py b/mmdet3d/core/points/depth_points.py index 3dfd5f7cd8..2d9221fb25 100644 --- a/mmdet3d/core/points/depth_points.py +++ b/mmdet3d/core/points/depth_points.py @@ -7,17 +7,17 @@ class DepthPoints(BasePoints): Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int): Number of the dimension of a point. - Each row is (x, y, z). Default to 3. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. Default to None. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Default to None. + dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ @@ -27,42 +27,30 @@ def __init__(self, tensor, points_dim=3, attribute_dims=None): self.rotation_axis = 2 def flip(self, bev_direction='horizontal'): - """Flip the boxes in BEV along given BEV direction.""" + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 1] = -self.tensor[:, 1] - def in_range_bev(self, point_range): - """Check whether the points are in the given range. - - Args: - point_range (list | torch.Tensor): The range of point - in order of (x_min, y_min, x_max, y_max). - - Returns: - torch.Tensor: Indicating whether each point is inside \ - the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > point_range[0]) - & (self.tensor[:, 1] > point_range[1]) - & (self.tensor[:, 0] < point_range[2]) - & (self.tensor[:, 1] < point_range[3])) - return in_range_flags - def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BasePoints`: The converted point of the same type \ + :obj:`BasePoints`: The converted point of the same type in the `dst` mode. 
""" from mmdet3d.core.bbox import Coord3DMode diff --git a/mmdet3d/core/points/lidar_points.py b/mmdet3d/core/points/lidar_points.py index ec0c98e273..ff4f57ab0e 100644 --- a/mmdet3d/core/points/lidar_points.py +++ b/mmdet3d/core/points/lidar_points.py @@ -7,17 +7,17 @@ class LiDARPoints(BasePoints): Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int): Number of the dimension of a point. - Each row is (x, y, z). Default to 3. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. Default to None. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Default to None. + dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ @@ -27,42 +27,30 @@ def __init__(self, tensor, points_dim=3, attribute_dims=None): self.rotation_axis = 2 def flip(self, bev_direction='horizontal'): - """Flip the boxes in BEV along given BEV direction.""" + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ if bev_direction == 'horizontal': self.tensor[:, 1] = -self.tensor[:, 1] elif bev_direction == 'vertical': self.tensor[:, 0] = -self.tensor[:, 0] - def in_range_bev(self, point_range): - """Check whether the points are in the given range. - - Args: - point_range (list | torch.Tensor): The range of point - in order of (x_min, y_min, x_max, y_max). - - Returns: - torch.Tensor: Indicating whether each point is inside \ - the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > point_range[0]) - & (self.tensor[:, 1] > point_range[1]) - & (self.tensor[:, 0] < point_range[2]) - & (self.tensor[:, 1] < point_range[3])) - return in_range_flags - def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor): The rotation and translation - matrix between different coordinates. Defaults to None. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: - :obj:`BasePoints`: The converted point of the same type \ + :obj:`BasePoints`: The converted point of the same type in the `dst` mode. """ from mmdet3d.core.bbox import Coord3DMode diff --git a/mmdet3d/core/post_processing/box3d_nms.py b/mmdet3d/core/post_processing/box3d_nms.py index 638c9aaae0..a8d6521d6f 100644 --- a/mmdet3d/core/post_processing/box3d_nms.py +++ b/mmdet3d/core/post_processing/box3d_nms.py @@ -15,16 +15,18 @@ def box3d_multiclass_nms(mlvl_bboxes, mlvl_dir_scores=None, mlvl_attr_scores=None, mlvl_bboxes2d=None): - """Multi-class nms for 3D boxes. + """Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D + IoU between BEV boxes. Args: mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M). M is the dimensions of boxes. 
mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes. + The coordinate system of the BEV boxes is counterclockwise. mlvl_scores (torch.Tensor): Multi-level boxes with shape (N, C + 1). N is the number of boxes. C is the number of classes. - score_thr (float): Score thredhold to filter boxes with low + score_thr (float): Score threshold to filter boxes with low confidence. max_num (int): Maximum number of boxes will be kept. cfg (dict): Configuration dict of NMS. @@ -36,8 +38,8 @@ def box3d_multiclass_nms(mlvl_bboxes, boxes. Defaults to None. Returns: - tuple[torch.Tensor]: Return results after nms, including 3D \ - bounding boxes, scores, labels, direction scores, attribute \ + tuple[torch.Tensor]: Return results after nms, including 3D + bounding boxes, scores, labels, direction scores, attribute scores (optional) and 2D bounding boxes (optional). """ # do multi class nms @@ -128,13 +130,13 @@ def box3d_multiclass_nms(mlvl_bboxes, def aligned_3d_nms(boxes, scores, classes, thresh): - """3d nms for aligned boxes. + """3D NMS for aligned boxes. Args: boxes (torch.Tensor): Aligned box with shape [n, 6]. scores (torch.Tensor): Scores of each box. classes (torch.Tensor): Class of each box. - thresh (float): Iou threshold for nms. + thresh (float): IoU threshold for nms. Returns: torch.Tensor: Indices of selected boxes. @@ -188,8 +190,8 @@ def circle_nms(dets, thresh, post_max_size=83): Args: dets (torch.Tensor): Detection results with the shape of [N, 3]. thresh (float): Value of threshold. - post_max_size (int): Max number of prediction to be kept. Defaults - to 83 + post_max_size (int, optional): Max number of prediction to be kept. + Defaults to 83. Returns: torch.Tensor: Indexes of the detections to be kept. @@ -217,4 +219,8 @@ def circle_nms(dets, thresh, post_max_size=83): # ovr = inter / areas[j] if dist <= thresh: suppressed[j] = 1 - return keep[:post_max_size] + + if post_max_size < len(keep): + return keep[:post_max_size] + + return keep diff --git a/mmdet3d/core/utils/__init__.py b/mmdet3d/core/utils/__init__.py index 69cbb69d8c..b2a8deca2b 100644 --- a/mmdet3d/core/utils/__init__.py +++ b/mmdet3d/core/utils/__init__.py @@ -1,4 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius +from .array_converter import ArrayConverter, array_converter +from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d, + gaussian_radius, get_ellip_gaussian_2D) -__all__ = ['gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian'] +__all__ = [ + 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian', + 'ArrayConverter', 'array_converter', 'ellip_gaussian2D', + 'get_ellip_gaussian_2D' +] diff --git a/mmdet3d/core/utils/array_converter.py b/mmdet3d/core/utils/array_converter.py new file mode 100644 index 0000000000..a555aa601d --- /dev/null +++ b/mmdet3d/core/utils/array_converter.py @@ -0,0 +1,324 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +from inspect import getfullargspec + +import numpy as np +import torch + + +def array_converter(to_torch=True, + apply_to=tuple(), + template_arg_name_=None, + recover=True): + """Wrapper function for data-type agnostic processing. + + First converts input arrays to PyTorch tensors or NumPy ndarrays + for middle calculation, then convert output to original data-type if + `recover=True`. 
+ + Args: + to_torch (Bool, optional): Whether convert to PyTorch tensors + for middle calculation. Defaults to True. + apply_to (tuple[str], optional): The arguments to which we apply + data-type conversion. Defaults to an empty tuple. + template_arg_name_ (str, optional): Argument serving as the template ( + return arrays should have the same dtype and device + as the template). Defaults to None. If None, we will use the + first argument in `apply_to` as the template argument. + recover (Bool, optional): Whether or not recover the wrapped function + outputs to the `template_arg_name_` type. Defaults to True. + + Raises: + ValueError: When template_arg_name_ is not among all args, or + when apply_to contains an arg which is not among all args, + a ValueError will be raised. When the template argument or + an argument to convert is a list or tuple, and cannot be + converted to a NumPy array, a ValueError will be raised. + TypeError: When the type of the template argument or + an argument to convert does not belong to the above range, + or the contents of such an list-or-tuple-type argument + do not share the same data type, a TypeError is raised. + + Returns: + (function): wrapped function. + + Example: + >>> import torch + >>> import numpy as np + >>> + >>> # Use torch addition for a + b, + >>> # and convert return values to the type of a + >>> @array_converter(apply_to=('a', 'b')) + >>> def simple_add(a, b): + >>> return a + b + >>> + >>> a = np.array([1.1]) + >>> b = np.array([2.2]) + >>> simple_add(a, b) + >>> + >>> # Use numpy addition for a + b, + >>> # and convert return values to the type of b + >>> @array_converter(to_torch=False, apply_to=('a', 'b'), + >>> template_arg_name_='b') + >>> def simple_add(a, b): + >>> return a + b + >>> + >>> simple_add() + >>> + >>> # Use torch funcs for floor(a) if flag=True else ceil(a), + >>> # and return the torch tensor + >>> @array_converter(apply_to=('a',), recover=False) + >>> def floor_or_ceil(a, flag=True): + >>> return torch.floor(a) if flag else torch.ceil(a) + >>> + >>> floor_or_ceil(a, flag=False) + """ + + def array_converter_wrapper(func): + """Outer wrapper for the function.""" + + @functools.wraps(func) + def new_func(*args, **kwargs): + """Inner wrapper for the arguments.""" + if len(apply_to) == 0: + return func(*args, **kwargs) + + func_name = func.__name__ + + arg_spec = getfullargspec(func) + + arg_names = arg_spec.args + arg_num = len(arg_names) + default_arg_values = arg_spec.defaults + if default_arg_values is None: + default_arg_values = [] + no_default_arg_num = len(arg_names) - len(default_arg_values) + + kwonly_arg_names = arg_spec.kwonlyargs + kwonly_default_arg_values = arg_spec.kwonlydefaults + if kwonly_default_arg_values is None: + kwonly_default_arg_values = {} + + all_arg_names = arg_names + kwonly_arg_names + + # in case there are args in the form of *args + if len(args) > arg_num: + named_args = args[:arg_num] + nameless_args = args[arg_num:] + else: + named_args = args + nameless_args = [] + + # template argument data type is used for all array-like arguments + if template_arg_name_ is None: + template_arg_name = apply_to[0] + else: + template_arg_name = template_arg_name_ + + if template_arg_name not in all_arg_names: + raise ValueError(f'{template_arg_name} is not among the ' + f'argument list of function {func_name}') + + # inspect apply_to + for arg_to_apply in apply_to: + if arg_to_apply not in all_arg_names: + raise ValueError(f'{arg_to_apply} is not ' + f'an argument of {func_name}') + + new_args = [] + 
new_kwargs = {} + + converter = ArrayConverter() + target_type = torch.Tensor if to_torch else np.ndarray + + # non-keyword arguments + for i, arg_value in enumerate(named_args): + if arg_names[i] in apply_to: + new_args.append( + converter.convert( + input_array=arg_value, target_type=target_type)) + else: + new_args.append(arg_value) + + if arg_names[i] == template_arg_name: + template_arg_value = arg_value + + kwonly_default_arg_values.update(kwargs) + kwargs = kwonly_default_arg_values + + # keyword arguments and non-keyword arguments using default value + for i in range(len(named_args), len(all_arg_names)): + arg_name = all_arg_names[i] + if arg_name in kwargs: + if arg_name in apply_to: + new_kwargs[arg_name] = converter.convert( + input_array=kwargs[arg_name], + target_type=target_type) + else: + new_kwargs[arg_name] = kwargs[arg_name] + else: + default_value = default_arg_values[i - no_default_arg_num] + if arg_name in apply_to: + new_kwargs[arg_name] = converter.convert( + input_array=default_value, target_type=target_type) + else: + new_kwargs[arg_name] = default_value + if arg_name == template_arg_name: + template_arg_value = kwargs[arg_name] + + # add nameless args provided by *args (if exists) + new_args += nameless_args + + return_values = func(*new_args, **new_kwargs) + converter.set_template(template_arg_value) + + def recursive_recover(input_data): + if isinstance(input_data, (tuple, list)): + new_data = [] + for item in input_data: + new_data.append(recursive_recover(item)) + return tuple(new_data) if isinstance(input_data, + tuple) else new_data + elif isinstance(input_data, dict): + new_data = {} + for k, v in input_data.items(): + new_data[k] = recursive_recover(v) + return new_data + elif isinstance(input_data, (torch.Tensor, np.ndarray)): + return converter.recover(input_data) + else: + return input_data + + if recover: + return recursive_recover(return_values) + else: + return return_values + + return new_func + + return array_converter_wrapper + + +class ArrayConverter: + + SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32, + np.int64, np.uint8, np.uint16, np.uint32, + np.uint64, np.float16, np.float32, np.float64) + + def __init__(self, template_array=None): + if template_array is not None: + self.set_template(template_array) + + def set_template(self, array): + """Set template array. + + Args: + array (tuple | list | int | float | np.ndarray | torch.Tensor): + Template array. + + Raises: + ValueError: If input is list or tuple and cannot be converted to + to a NumPy array, a ValueError is raised. + TypeError: If input type does not belong to the above range, + or the contents of a list or tuple do not share the + same data type, a TypeError is raised. 
+ """ + self.array_type = type(array) + self.is_num = False + self.device = 'cpu' + + if isinstance(array, np.ndarray): + self.dtype = array.dtype + elif isinstance(array, torch.Tensor): + self.dtype = array.dtype + self.device = array.device + elif isinstance(array, (list, tuple)): + try: + array = np.array(array) + if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: + raise TypeError + self.dtype = array.dtype + except (ValueError, TypeError): + print(f'The following list cannot be converted to' + f' a numpy array of supported dtype:\n{array}') + raise + elif isinstance(array, self.SUPPORTED_NON_ARRAY_TYPES): + self.array_type = np.ndarray + self.is_num = True + self.dtype = np.dtype(type(array)) + else: + raise TypeError(f'Template type {self.array_type}' + f' is not supported.') + + def convert(self, input_array, target_type=None, target_array=None): + """Convert input array to target data type. + + Args: + input_array (tuple | list | np.ndarray | + torch.Tensor | int | float ): + Input array. Defaults to None. + target_type ( | , + optional): + Type to which input array is converted. Defaults to None. + target_array (np.ndarray | torch.Tensor, optional): + Template array to which input array is converted. + Defaults to None. + + Raises: + ValueError: If input is list or tuple and cannot be converted to + to a NumPy array, a ValueError is raised. + TypeError: If input type does not belong to the above range, + or the contents of a list or tuple do not share the + same data type, a TypeError is raised. + """ + if isinstance(input_array, (list, tuple)): + try: + input_array = np.array(input_array) + if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: + raise TypeError + except (ValueError, TypeError): + print(f'The input cannot be converted to' + f' a single-type numpy array:\n{input_array}') + raise + elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES): + input_array = np.array(input_array) + array_type = type(input_array) + assert target_type is not None or target_array is not None, \ + 'must specify a target' + if target_type is not None: + assert target_type in (np.ndarray, torch.Tensor), \ + 'invalid target type' + if target_type == array_type: + return input_array + elif target_type == np.ndarray: + # default dtype is float32 + converted_array = input_array.cpu().numpy().astype(np.float32) + else: + # default dtype is float32, device is 'cpu' + converted_array = torch.tensor( + input_array, dtype=torch.float32) + else: + assert isinstance(target_array, (np.ndarray, torch.Tensor)), \ + 'invalid target array type' + if isinstance(target_array, array_type): + return input_array + elif isinstance(target_array, np.ndarray): + converted_array = input_array.cpu().numpy().astype( + target_array.dtype) + else: + converted_array = target_array.new_tensor(input_array) + return converted_array + + def recover(self, input_array): + assert isinstance(input_array, (np.ndarray, torch.Tensor)), \ + 'invalid input array type' + if isinstance(input_array, self.array_type): + return input_array + elif isinstance(input_array, torch.Tensor): + converted_array = input_array.cpu().numpy().astype(self.dtype) + else: + converted_array = torch.tensor( + input_array, dtype=self.dtype, device=self.device) + if self.is_num: + converted_array = converted_array.item() + return converted_array diff --git a/mmdet3d/core/utils/gaussian.py b/mmdet3d/core/utils/gaussian.py index a07963e151..66ccbd9e76 100644 --- a/mmdet3d/core/utils/gaussian.py +++ b/mmdet3d/core/utils/gaussian.py @@ -8,7 +8,7 @@ 
def gaussian_2d(shape, sigma=1): Args: shape (list[int]): Shape of the map. - sigma (float): Sigma to generate gaussian map. + sigma (float, optional): Sigma to generate gaussian map. Defaults to 1. Returns: @@ -28,8 +28,8 @@ def draw_heatmap_gaussian(heatmap, center, radius, k=1): Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. - radius (int): Radius of gausian. - K (int): Multiple of masked_gaussian. Defaults to 1. + radius (int): Radius of gaussian. + K (int, optional): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. @@ -59,7 +59,7 @@ def gaussian_radius(det_size, min_overlap=0.5): Args: det_size (tuple[torch.Tensor]): Size of the detection result. - min_overlap (float): Gaussian_overlap. Defaults to 0.5. + min_overlap (float, optional): Gaussian_overlap. Defaults to 0.5. Returns: torch.Tensor: Computed radius. @@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5): sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) + + +def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1): + """Generate 2D ellipse gaussian heatmap. + + Args: + heatmap (Tensor): Input heatmap, the gaussian kernel will cover on + it and maintain the max value. + center (list[int]): Coord of gaussian kernel's center. + radius_x (int): X-axis radius of gaussian kernel. + radius_y (int): Y-axis radius of gaussian kernel. + k (int, optional): Coefficient of gaussian kernel. Default: 1. + + Returns: + out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. + """ + diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1 + gaussian_kernel = ellip_gaussian2D((radius_x, radius_y), + sigma_x=diameter_x / 6, + sigma_y=diameter_y / 6, + dtype=heatmap.dtype, + device=heatmap.device) + + x, y = int(center[0]), int(center[1]) + height, width = heatmap.shape[0:2] + + left, right = min(x, radius_x), min(width - x, radius_x + 1) + top, bottom = min(y, radius_y), min(height - y, radius_y + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom, + radius_x - left:radius_x + right] + out_heatmap = heatmap + torch.max( + masked_heatmap, + masked_gaussian * k, + out=out_heatmap[y - top:y + bottom, x - left:x + right]) + + return out_heatmap + + +def ellip_gaussian2D(radius, + sigma_x, + sigma_y, + dtype=torch.float32, + device='cpu'): + """Generate 2D ellipse gaussian kernel. + + Args: + radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian + kernel. + sigma_x (int): X-axis sigma of gaussian function. + sigma_y (int): Y-axis sigma of gaussian function. + dtype (torch.dtype, optional): Dtype of gaussian tensor. + Default: torch.float32. + device (str, optional): Device of gaussian tensor. + Default: 'cpu'. + + Returns: + h (Tensor): Gaussian kernel with a + ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape. + """ + x = torch.arange( + -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1) + y = torch.arange( + -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1) + + h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) / + (2 * sigma_y * sigma_y)).exp() + h[h < torch.finfo(h.dtype).eps * h.max()] = 0 + + return h diff --git a/mmdet3d/core/visualizer/image_vis.py b/mmdet3d/core/visualizer/image_vis.py index 891fb549c0..7ac765c207 100644 --- a/mmdet3d/core/visualizer/image_vis.py +++ b/mmdet3d/core/visualizer/image_vis.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. 
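# Usage sketch for the elliptical-gaussian helper added above (values are
# illustrative); get_ellip_gaussian_2D paints an elliptical peak of height k
# onto the heatmap, keeping the elementwise maximum with what is already there:
import torch

from mmdet3d.core.utils import get_ellip_gaussian_2D

heatmap = torch.zeros(96, 312)             # one class channel
center = [100, 40]                         # (x, y) in feature-map pixels
heatmap = get_ellip_gaussian_2D(heatmap, center, radius_x=8, radius_y=4)
# heatmap[40, 100] is now 1.0 and the response decays elliptically around it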
All rights reserved. import copy + import cv2 import numpy as np import torch @@ -18,7 +19,7 @@ def project_pts_on_img(points, raw_img (numpy.array): The numpy array of image. lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. - max_distance (float): the max distance of the points cloud. + max_distance (float, optional): the max distance of the points cloud. Default: 70. thickness (int, optional): The thickness of 2D points. Default: -1. """ @@ -69,7 +70,8 @@ def plot_rect3d_on_img(img, num_rects (int): Number of 3D rectangulars. rect_corners (numpy.array): Coordinates of the corners of 3D rectangulars. Should be in the shape of [num_rect, 8, 2]. - color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), @@ -99,7 +101,8 @@ def draw_lidar_bbox3d_on_img(bboxes3d, lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. img_metas (dict): Useless here. - color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ img = raw_img.copy() @@ -136,7 +139,8 @@ def draw_depth_bbox3d_on_img(bboxes3d, raw_img (numpy.array): The numpy array of image. calibs (dict): Camera calibration information, Rt and K. img_metas (dict): Used in coordinates transformation. - color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ from mmdet3d.core.bbox import points_cam2img @@ -176,7 +180,8 @@ def draw_camera_bbox3d_on_img(bboxes3d, cam2img (dict): Camera intrinsic matrix, denoted as `K` in depth bbox coordinate system. img_metas (dict): Useless here. - color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ from mmdet3d.core.bbox import points_cam2img @@ -188,7 +193,10 @@ def draw_camera_bbox3d_on_img(bboxes3d, points_3d = corners_3d.reshape(-1, 3) if not isinstance(cam2img, torch.Tensor): cam2img = torch.from_numpy(np.array(cam2img)) - cam2img = cam2img.reshape(3, 3).float().cpu() + + assert (cam2img.shape == torch.Size([3, 3]) + or cam2img.shape == torch.Size([4, 4])) + cam2img = cam2img.float().cpu() # project to 2d to get image coords (uv) uv_origin = points_cam2img(points_3d, cam2img) diff --git a/mmdet3d/core/visualizer/open3d_vis.py b/mmdet3d/core/visualizer/open3d_vis.py index 0790ee483a..be0648ed84 100644 --- a/mmdet3d/core/visualizer/open3d_vis.py +++ b/mmdet3d/core/visualizer/open3d_vis.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy + import numpy as np import torch @@ -22,12 +23,12 @@ def _draw_points(points, points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. - points_size (int): the size of points to show on visualizer. + points_size (int, optional): the size of points to show on visualizer. Default: 2. - point_color (tuple[float]): the color of points. 
+ point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). - mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. Default: 'xyz'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. Returns: tuple: points, color of each point. @@ -69,19 +70,21 @@ def _draw_bboxes(bbox3d, Args: bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. + 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. points_colors (numpy.array): color of each points. - pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None. - bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). - points_in_box_color (tuple[float]): + pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. + Default: None. + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): the color of points inside bbox3d. Default: (1, 0, 0). - rot_axis (int): rotation axis of bbox. Default: 2. - center_mode (bool): indicate the center of bbox is bottom center - or gravity center. avaliable mode + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. Default: 'xyz'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ if isinstance(bbox3d, torch.Tensor): bbox3d = bbox3d.cpu().numpy() @@ -135,23 +138,27 @@ def show_pts_boxes(points, Args: points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. - bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None. - show (bool): whether to show the visualization results. Default: True. - save_path (str): path to save visualized results. Default: None. - points_size (int): the size of points to show on visualizer. + bbox3d (numpy.array | torch.tensor, shape=[M, 7], optional): + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + Defaults to None. + show (bool, optional): whether to show the visualization results. + Default: True. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. Default: 2. - point_color (tuple[float]): the color of points. + point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). - points_in_box_color (tuple[float]): + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int): rotation axis of bbox. Default: 2. - center_mode (bool): indicate the center of bbox is bottom center - or gravity center. avaliable mode + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is bottom + center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. 
Default: 'xyz'. + mode (str, optional): indicate type of the input points, available + mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ # TODO: support score and class info assert 0 <= rot_axis <= 2 @@ -196,21 +203,23 @@ def _draw_bboxes_ind(bbox3d, Args: bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. + 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. indices (numpy.array | torch.tensor, shape=[N, M]): indicate which bbox3d that each point lies in. points_colors (numpy.array): color of each points. - pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None. - bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). - points_in_box_color (tuple[float]): + pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. + Default: None. + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int): rotation axis of bbox. Default: 2. - center_mode (bool): indicate the center of bbox is bottom center - or gravity center. avaliable mode + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. Default: 'xyz'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ if isinstance(bbox3d, torch.Tensor): bbox3d = bbox3d.cpu().numpy() @@ -270,24 +279,28 @@ def show_pts_index_boxes(points, points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None. - show (bool): whether to show the visualization results. Default: True. - indices (numpy.array | torch.tensor, shape=[N, M]): + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + Defaults to None. + show (bool, optional): whether to show the visualization results. + Default: True. + indices (numpy.array | torch.tensor, shape=[N, M], optional): indicate which bbox3d that each point lies in. Default: None. - save_path (str): path to save visualized results. Default: None. - points_size (int): the size of points to show on visualizer. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. Default: 2. - point_color (tuple[float]): the color of points. + point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). - points_in_box_color (tuple[float]): + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int): rotation axis of bbox. Default: 2. - center_mode (bool): indicate the center of bbox is bottom center - or gravity center. avaliable mode + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. 
- mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. Default: 'xyz'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ # TODO: support score and class info assert 0 <= rot_axis <= 2 @@ -324,24 +337,27 @@ class Visualizer(object): points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points cloud is in mode of Coord3DMode.DEPTH (please refer to core.structures.coord_3d_mode). - bbox3d (numpy.array, shape=[M, 7]): 3d bbox (x, y, z, dx, dy, dz, yaw) - to visualize. The 3d bbox is in mode of Box3DMode.DEPTH with + bbox3d (numpy.array, shape=[M, 7], optional): 3D bbox + (x, y, z, x_size, y_size, z_size, yaw) to visualize. + The 3D bbox is in mode of Box3DMode.DEPTH with gravity_center (please refer to core.structures.box_3d_mode). Default: None. - save_path (str): path to save visualized results. Default: None. - points_size (int): the size of points to show on visualizer. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. Default: 2. - point_color (tuple[float]): the color of points. + point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). - points_in_box_color (tuple[float]): + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int): rotation axis of bbox. Default: 2. - center_mode (bool): indicate the center of bbox is bottom center - or gravity center. avaliable mode + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str): indicate type of the input points, avaliable mode - ['xyz', 'xyzrgb']. Default: 'xyz'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ def __init__(self, @@ -390,12 +406,13 @@ def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None): Args: bbox3d (numpy.array, shape=[M, 7]): - 3D bbox (x, y, z, dx, dy, dz, yaw) to be visualized. - The 3d bbox is in mode of Box3DMode.DEPTH with - gravity_center (please refer to core.structures.box_3d_mode). - bbox_color (tuple[float]): the color of bbox. Defaule: None. + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) + to be visualized. The 3d bbox is in mode of + Box3DMode.DEPTH with gravity_center (please refer to + core.structures.box_3d_mode). + bbox_color (tuple[float]): the color of bbox. Default: None. points_in_box_color (tuple[float]): the color of points which - are in bbox3d. Defaule: None. + are in bbox3d. Default: None. """ if bbox_color is None: bbox_color = self.bbox_color @@ -431,7 +448,7 @@ def show(self, save_path=None): """Visualize the points cloud. Args: - save_path (str): path to save image. Default: None. + save_path (str, optional): path to save image. Default: None. """ self.o3d_visualizer.run() diff --git a/mmdet3d/core/visualizer/show_result.py b/mmdet3d/core/visualizer/show_result.py index b2d46cb0ec..aa732cf47b 100644 --- a/mmdet3d/core/visualizer/show_result.py +++ b/mmdet3d/core/visualizer/show_result.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
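# Hedged sketch of the Visualizer workflow documented above; it needs open3d and
# a display, and the 7-value box layout (x, y, z, x_size, y_size, z_size, yaw)
# follows the docstring above. The point cloud and box below are made up:
import numpy as np

from mmdet3d.core.visualizer.open3d_vis import Visualizer

points = np.random.rand(1000, 3).astype(np.float32) * 4.0
boxes = np.array([[2.0, 2.0, 1.0, 1.0, 0.8, 0.6, 0.0]], dtype=np.float32)

vis = Visualizer(points)
vis.add_bboxes(bbox3d=boxes)
vis.show()  # opens the interactive open3d window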
+from os import path as osp + import mmcv import numpy as np import trimesh -from os import path as osp from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img, draw_lidar_bbox3d_on_img) @@ -35,7 +36,7 @@ def _write_oriented_bbox(scene_bbox, out_filename): Args: scene_bbox(list[ndarray] or ndarray): xyz pos of center and - 3 lengths (dx,dy,dz) and heading angle around Z axis. + 3 lengths (x_size, y_size, z_size) and heading angle around Z axis. Y forward, X right, Z upward. heading angle of positive X is 0, heading angle of positive Y is 90 degrees. out_filename(str): Filename. @@ -131,16 +132,14 @@ def show_result(points, if gt_bboxes is not None: # bottom center to gravity center gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2 - # the positive direction for yaw in meshlab is clockwise - gt_bboxes[:, 6] *= -1 + _write_oriented_bbox(gt_bboxes, osp.join(result_path, f'{filename}_gt.obj')) if pred_bboxes is not None: # bottom center to gravity center pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2 - # the positive direction for yaw in meshlab is clockwise - pred_bboxes[:, 6] *= -1 + _write_oriented_bbox(pred_bboxes, osp.join(result_path, f'{filename}_pred.obj')) @@ -152,7 +151,7 @@ def show_seg_result(points, filename, palette, ignore_index=None, - show=True, + show=False, snapshot=False): """Convert results into format that is directly readable for meshlab. @@ -163,10 +162,10 @@ def show_seg_result(points, out_dir (str): Path of output directory filename (str): Filename of the current frame. palette (np.ndarray): Mapping between class labels and colors. - ignore_index (int, optional): The label index to be ignored, e.g. \ + ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. Defaults to None. show (bool, optional): Visualize the results online. Defaults to False. - snapshot (bool, optional): Whether to save the online results. \ + snapshot (bool, optional): Whether to save the online results. Defaults to False. """ # we need 3D coordinates to visualize segmentation mask @@ -226,7 +225,7 @@ def show_multi_modality_result(img, filename, box_mode='lidar', img_metas=None, - show=True, + show=False, gt_bbox_color=(61, 102, 255), pred_bbox_color=(241, 101, 72)): """Convert multi-modality detection results into 2D results. @@ -241,14 +240,16 @@ def show_multi_modality_result(img, according to the camera intrinsic parameters. out_dir (str): Path of output directory. filename (str): Filename of the current frame. - box_mode (str): Coordinate system the boxes are in. Should be one of - 'depth', 'lidar' and 'camera'. Defaults to 'lidar'. - img_metas (dict): Used in projecting depth bbox. - show (bool): Visualize the results online. Defaults to False. - gt_bbox_color (str or tuple(int)): Color of bbox lines. - The tuple of color should be in BGR order. Default: (255, 102, 61) - pred_bbox_color (str or tuple(int)): Color of bbox lines. - The tuple of color should be in BGR order. Default: (72, 101, 241) + box_mode (str, optional): Coordinate system the boxes are in. + Should be one of 'depth', 'lidar' and 'camera'. + Defaults to 'lidar'. + img_metas (dict, optional): Used in projecting depth bbox. + Defaults to None. + show (bool, optional): Visualize the results online. Defaults to False. + gt_bbox_color (str or tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. Default: (255, 102, 61). + pred_bbox_color (str or tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. 
Default: (72, 101, 241). """ if box_mode == 'depth': draw_bbox = draw_depth_bbox3d_on_img diff --git a/mmdet3d/core/voxel/voxel_generator.py b/mmdet3d/core/voxel/voxel_generator.py index 615b749558..404f2cdc9b 100644 --- a/mmdet3d/core/voxel/voxel_generator.py +++ b/mmdet3d/core/voxel/voxel_generator.py @@ -82,18 +82,18 @@ def points_to_voxel(points, """convert kitti points(N, >=3) to voxels. Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size - coors_range (list[float | tuple[float] | ndarray]): Voxel range. \ + coors_range (list[float | tuple[float] | ndarray]): Voxel range. format: xyzxyz, minmax max_points (int): Indicate maximum points contained in a voxel. - reverse_index (bool): Whether return reversed coordinates. \ - if points has xyz format and reverse_index is True, output \ - coordinates will be zyx format, but points in features always \ + reverse_index (bool): Whether return reversed coordinates. + if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always xyz format. - max_voxels (int): Maximum number of voxels this function creates. \ - For second, 20000 is a good choice. Points should be shuffled for \ + max_voxels (int): Maximum number of voxels this function creates. + For second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: @@ -147,20 +147,20 @@ def _points_to_voxel_reverse_kernel(points, """convert kitti points(N, >=3) to voxels. Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. - voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size \ - coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \ + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. format: xyzxyz, minmax num_points_per_voxel (int): Number of points per voxel. - coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \ - which has the same shape as the complete voxel map. It indicates \ + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), + which has the same shape as the complete voxel map. It indicates the index of each corresponding voxel. voxels (np.ndarray): Created empty voxels. coors (np.ndarray): Created coordinates of each voxel. max_points (int): Indicate maximum points contained in a voxel. - max_voxels (int): Maximum number of voxels this function create. \ - for second, 20000 is a good choice. Points should be shuffled for \ + max_voxels (int): Maximum number of voxels this function create. + for second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: @@ -221,20 +221,20 @@ def _points_to_voxel_kernel(points, """convert kitti points(N, >=3) to voxels. Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size. 
- coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \ + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. format: xyzxyz, minmax num_points_per_voxel (int): Number of points per voxel. - coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \ - which has the same shape as the complete voxel map. It indicates \ + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), + which has the same shape as the complete voxel map. It indicates the index of each corresponding voxel. voxels (np.ndarray): Created empty voxels. coors (np.ndarray): Created coordinates of each voxel. max_points (int): Indicate maximum points contained in a voxel. - max_voxels (int): Maximum number of voxels this function create. \ - for second, 20000 is a good choice. Points should be shuffled for \ + max_voxels (int): Maximum number of voxels this function create. + for second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: diff --git a/mmdet3d/datasets/__init__.py b/mmdet3d/datasets/__init__.py index cb64c89d24..26ba3d73c9 100644 --- a/mmdet3d/datasets/__init__.py +++ b/mmdet3d/datasets/__init__.py @@ -9,14 +9,15 @@ from .nuscenes_dataset import NuScenesDataset from .nuscenes_mono_dataset import NuScenesMonoDataset # yapf: disable -from .pipelines import (BackgroundPointsFilter, GlobalAlignment, +from .pipelines import (AffineResize, BackgroundPointsFilter, GlobalAlignment, GlobalRotScaleTrans, IndoorPatchPointSample, IndoorPointSample, LoadAnnotations3D, - LoadPointsFromFile, LoadPointsFromMultiSweeps, - NormalizePointsColor, ObjectNameFilter, ObjectNoise, - ObjectRangeFilter, ObjectSample, PointSample, - PointShuffle, PointsRangeFilter, RandomDropPointsColor, - RandomFlip3D, RandomJitterPoints, + LoadPointsFromDict, LoadPointsFromFile, + LoadPointsFromMultiSweeps, NormalizePointsColor, + ObjectNameFilter, ObjectNoise, ObjectRangeFilter, + ObjectSample, PointSample, PointShuffle, + PointsRangeFilter, RandomDropPointsColor, RandomFlip3D, + RandomJitterPoints, RandomShiftScale, VoxelBasedPointSampler) # yapf: enable from .s3dis_dataset import S3DISDataset, S3DISSegDataset @@ -38,5 +39,6 @@ 'Custom3DDataset', 'Custom3DSegDataset', 'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor', 'RandomJitterPoints', - 'ObjectNameFilter' + 'ObjectNameFilter', 'AffineResize', 'RandomShiftScale', + 'LoadPointsFromDict' ] diff --git a/mmdet3d/datasets/builder.py b/mmdet3d/datasets/builder.py index 00ebce0851..2bb09e9411 100644 --- a/mmdet3d/datasets/builder.py +++ b/mmdet3d/datasets/builder.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import platform + from mmcv.utils import Registry, build_from_cfg from mmdet.datasets import DATASETS diff --git a/mmdet3d/datasets/custom_3d.py b/mmdet3d/datasets/custom_3d.py index 3d17aeb7ca..1ba52c224e 100644 --- a/mmdet3d/datasets/custom_3d.py +++ b/mmdet3d/datasets/custom_3d.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import tempfile import warnings from os import path as osp + +import mmcv +import numpy as np from torch.utils.data import Dataset from mmdet.datasets import DATASETS @@ -88,7 +89,7 @@ def get_data_info(self, index): index (int): Index of the sample data to get. 
Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. @@ -177,7 +178,7 @@ def get_classes(cls, classes=None): """Get class names of current dataset. Args: - classes (Sequence[str] | str | None): If classes is None, use + classes (Sequence[str] | str): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is @@ -207,13 +208,13 @@ def format_results(self, Args: outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: (outputs, tmp_dir), outputs is the detection results, \ - tmp_dir is the temporal directory created for saving json \ + tuple: (outputs, tmp_dir), outputs is the detection results, + tmp_dir is the temporal directory created for saving json files when ``jsonfile_prefix`` is not specified. """ if pklfile_prefix is None: @@ -237,11 +238,14 @@ def evaluate(self, Args: results (list[dict]): List of results. - metric (str | list[str]): Metrics to be evaluated. - iou_thr (list[float]): AP IoU thresholds. - show (bool): Whether to visualize. + metric (str | list[str], optional): Metrics to be evaluated. + Defaults to None. + iou_thr (list[float]): AP IoU thresholds. Defaults to (0.25, 0.5). + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Defaults to None. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. @@ -281,7 +285,7 @@ def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: - pipeline (list[dict] | None): Input pipeline. If None is given, \ + pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: diff --git a/mmdet3d/datasets/custom_3d_seg.py b/mmdet3d/datasets/custom_3d_seg.py index c07aa675a4..bfa89a4713 100644 --- a/mmdet3d/datasets/custom_3d_seg.py +++ b/mmdet3d/datasets/custom_3d_seg.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import tempfile import warnings from os import path as osp + +import mmcv +import numpy as np from torch.utils.data import Dataset from mmdet.datasets import DATASETS @@ -32,7 +33,7 @@ class Custom3DSegDataset(Dataset): as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. - ignore_index (int, optional): The label index to be ignored, e.g. \ + ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES) to be consistent with PointSegClassMapping function in pipeline. Defaults to None. @@ -102,7 +103,7 @@ def get_data_info(self, index): index (int): Index of the sample data to get. 
Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. @@ -179,13 +180,13 @@ def get_classes_and_palette(self, classes=None, palette=None): This function is taken from MMSegmentation. Args: - classes (Sequence[str] | str | None): If classes is None, use + classes (Sequence[str] | str): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. Defaults to None. - palette (Sequence[Sequence[int]]] | np.ndarray | None): + palette (Sequence[Sequence[int]]] | np.ndarray): The palette of segmentation map. If None is given, random palette will be generated. Defaults to None. """ @@ -276,13 +277,13 @@ def format_results(self, Args: outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: (outputs, tmp_dir), outputs is the detection results, \ - tmp_dir is the temporal directory created for saving json \ + tuple: (outputs, tmp_dir), outputs is the detection results, + tmp_dir is the temporal directory created for saving json files when ``jsonfile_prefix`` is not specified. """ if pklfile_prefix is None: @@ -306,7 +307,7 @@ def evaluate(self, Args: results (list[dict]): List of results. metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | None | str): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Defaults to None. show (bool, optional): Whether to visualize. Defaults to False. @@ -364,7 +365,7 @@ def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: - pipeline (list[dict] | None): Input pipeline. If None is given, \ + pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: diff --git a/mmdet3d/datasets/kitti2d_dataset.py b/mmdet3d/datasets/kitti2d_dataset.py index 9254bcd6b8..65d08eca11 100644 --- a/mmdet3d/datasets/kitti2d_dataset.py +++ b/mmdet3d/datasets/kitti2d_dataset.py @@ -206,7 +206,8 @@ def reformat_bbox(self, outputs, out=None): Args: outputs (list[np.ndarray]): List of arrays storing the inferenced bounding boxes and scores. - out (str | None): The prefix of output file. Default: None. + out (str, optional): The prefix of output file. + Default: None. Returns: list[dict]: A list of dictionaries with the kitti 2D format. @@ -222,7 +223,7 @@ def evaluate(self, result_files, eval_types=None): Args: result_files (str): Path of result files. - eval_types (str): Types of evaluation. Default: None. + eval_types (str, optional): Types of evaluation. Default: None. KITTI dataset only support 'bbox' evaluation type. Returns: diff --git a/mmdet3d/datasets/kitti_dataset.py b/mmdet3d/datasets/kitti_dataset.py index bd8e3b58a2..9a864efff2 100644 --- a/mmdet3d/datasets/kitti_dataset.py +++ b/mmdet3d/datasets/kitti_dataset.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
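As a usage note for the format_results()/evaluate() contract documented above, here is an illustrative sketch (not part of the patch; dataset and results are placeholders): when pklfile_prefix is omitted, the dumped files live in a temporary directory that the caller must clean up.

# Dump results under an explicit prefix, or fall back to a temp directory.
outputs, tmp_dir = dataset.format_results(results, pklfile_prefix='work_dirs/demo/prefix')
# outputs, tmp_dir = dataset.format_results(results)  # tmp_dir is a TemporaryDirectory here

if tmp_dir is not None:
    tmp_dir.cleanup()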
import copy -import mmcv -import numpy as np import os import tempfile +from os import path as osp + +import mmcv +import numpy as np import torch from mmcv.utils import print_log -from os import path as osp from mmdet.datasets import DATASETS from ..core import show_multi_modality_result, show_result @@ -47,8 +48,9 @@ class KittiDataset(Custom3DDataset): Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. - pcd_limit_range (list): The range of point cloud used to filter - invalid predicted boxes. Default: [0, -40, -3, 70.4, 40, 0.0]. + pcd_limit_range (list, optional): The range of point cloud used to + filter invalid predicted boxes. + Default: [0, -40, -3, 70.4, 40, 0.0]. """ CLASSES = ('car', 'pedestrian', 'cyclist') @@ -100,14 +102,14 @@ def get_data_info(self, index): index (int): Index of the sample data to get. Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - - img_prefix (str | None): Prefix of image files. + - img_prefix (str): Prefix of image files. - img_info (dict): Image info. - - lidar2img (list[np.ndarray], optional): Transformations \ + - lidar2img (list[np.ndarray], optional): Transformations from lidar to different cameras. - ann_info (dict): Annotation info. """ @@ -145,19 +147,38 @@ def get_ann_info(self, index): Returns: dict: annotation information consists of the following keys: - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes. - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_bboxes (np.ndarray): 2D ground truth bboxes. - gt_labels (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. - - difficulty (int): kitti difficulty. + - difficulty (int): Difficulty defined by KITTI. + 0, 1, 2 represent xxxxx respectively. """ # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + if 'plane' in info: + # convert ground plane to velodyne coordinates + reverse = np.linalg.inv(rect @ Trv2c) + + (plane_norm_cam, + plane_off_cam) = (info['plane'][:3], + -info['plane'][:3] * info['plane'][3]) + plane_norm_lidar = \ + (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0] + plane_off_lidar = ( + reverse[:3, :3] @ plane_off_cam[:, None][:, 0] + + reverse[:3, 3]) + plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, )) + plane_lidar[:3] = plane_norm_lidar + plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar + else: + plane_lidar = None + difficulty = info['annos']['difficulty'] annos = info['annos'] # we need other objects to avoid collision when sample @@ -193,6 +214,7 @@ def get_ann_info(self, index): bboxes=gt_bboxes, labels=gt_labels, gt_names=gt_names, + plane=plane_lidar, difficulty=difficulty) return anns_results @@ -251,17 +273,17 @@ def format_results(self, Args: outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - submission_prefix (str | None): The prefix of submitted files. 
It + submission_prefix (str): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing \ - the json filepaths, tmp_dir is the temporal directory created \ + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. """ if pklfile_prefix is None: @@ -311,17 +333,19 @@ def evaluate(self, Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing + metric (str | list[str], optional): Metrics to be evaluated. + Default: None. + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str, optional): The prefix of pkl files, including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - submission_prefix (str | None): The prefix of submission data. + submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. - show (bool): Whether to visualize. + Default: None. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. @@ -361,8 +385,8 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: - self.show(results, out_dir, pipeline=pipeline) + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, @@ -374,11 +398,11 @@ def bbox2result_kitti(self, submission. Args: - net_outputs (list[np.ndarray]): List of array storing the \ + net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. - pklfile_prefix (str | None): The prefix of pkl file. - submission_prefix (str | None): The prefix of submission file. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries with the kitti format. @@ -489,11 +513,11 @@ def bbox2result_kitti2d(self, submission. Args: - net_outputs (list[np.ndarray]): List of array storing the \ + net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. - pklfile_prefix (str | None): The prefix of pkl file. - submission_prefix (str | None): The prefix of submission file. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries have the kitti format @@ -607,9 +631,9 @@ def convert_valid_bboxes(self, box_dict, info): dict: Valid predicted boxes. - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in \ + - box3d_camera (np.ndarray): 3D bounding boxes in camera coordinate. 
- - box3d_lidar (np.ndarray): 3D bounding boxes in \ + - box3d_lidar (np.ndarray): 3D bounding boxes in LiDAR coordinate. - scores (np.ndarray): Scores of boxes. - label_preds (np.ndarray): Class label predictions. @@ -620,8 +644,6 @@ def convert_valid_bboxes(self, box_dict, info): scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] - # TODO: remove the hack of yaw - box_preds.tensor[:, -1] = box_preds.tensor[:, -1] - np.pi box_preds.limit_yaw(offset=0.5, period=np.pi * 2) if len(box_preds) == 0: @@ -701,7 +723,8 @@ def show(self, results, out_dir, show=True, pipeline=None): Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. + show (bool): Whether to visualize the results online. + Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ diff --git a/mmdet3d/datasets/kitti_mono_dataset.py b/mmdet3d/datasets/kitti_mono_dataset.py index ba77690f5f..83f0f2c62a 100644 --- a/mmdet3d/datasets/kitti_mono_dataset.py +++ b/mmdet3d/datasets/kitti_mono_dataset.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import tempfile +from os import path as osp + import mmcv import numpy as np -import tempfile import torch from mmcv.utils import print_log -from os import path as osp from mmdet.datasets import DATASETS from ..core.bbox import Box3DMode, CameraInstance3DBoxes, points_cam2img @@ -57,8 +58,8 @@ def _parse_ann_info(self, img_info, ann_info): with_mask (bool): Whether to parse mask annotations. Returns: - dict: A dict containing the following keys: bboxes, bboxes_ignore,\ - labels, masks, seg_map. "masks" are raw annotations and not \ + dict: A dict containing the following keys: bboxes, bboxes_ignore, + labels, masks, seg_map. "masks" are raw annotations and not decoded into binary masks. """ gt_bboxes = [] @@ -147,17 +148,17 @@ def format_results(self, Args: outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - submission_prefix (str | None): The prefix of submitted files. It + submission_prefix (str): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing \ - the json filepaths, tmp_dir is the temporal directory created \ + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. """ if pklfile_prefix is None: @@ -202,22 +203,26 @@ def evaluate(self, pklfile_prefix=None, submission_prefix=None, show=False, - out_dir=None): + out_dir=None, + pipeline=None): """Evaluation in KITTI protocol. Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing + metric (str | list[str], optional): Metrics to be evaluated. + Defaults to None. + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. 
- pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str, optional): The prefix of pkl files, including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - submission_prefix (str | None): The prefix of submission datas. + submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. - show (bool): Whether to visualize. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: @@ -255,8 +260,8 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: - self.show(results, out_dir) + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, @@ -268,11 +273,11 @@ def bbox2result_kitti(self, submission. Args: - net_outputs (list[np.ndarray]): List of array storing the \ + net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. - pklfile_prefix (str | None): The prefix of pkl file. - submission_prefix (str | None): The prefix of submission file. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries with the kitti format. @@ -383,11 +388,11 @@ def bbox2result_kitti2d(self, submission. Args: - net_outputs (list[np.ndarray]): List of array storing the \ + net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. - pklfile_prefix (str | None): The prefix of pkl file. - submission_prefix (str | None): The prefix of submission file. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries have the kitti format @@ -498,7 +503,7 @@ def convert_valid_bboxes(self, box_dict, info): Returns: dict: Valid predicted boxes. - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in \ + - box3d_camera (np.ndarray): 3D bounding boxes in camera coordinate. - scores (np.ndarray): Scores of boxes. - label_preds (np.ndarray): Class label predictions. diff --git a/mmdet3d/datasets/lyft_dataset.py b/mmdet3d/datasets/lyft_dataset.py index 699c2bfe3a..f775cf4fc3 100644 --- a/mmdet3d/datasets/lyft_dataset.py +++ b/mmdet3d/datasets/lyft_dataset.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os +import tempfile +from os import path as osp + import mmcv import numpy as np -import os import pandas as pd -import tempfile from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft from lyft_dataset_sdk.utils.data_classes import Box as LyftBox -from os import path as osp from pyquaternion import Quaternion from mmdet3d.core.evaluation.lyft_eval import lyft_eval @@ -129,7 +130,7 @@ def get_data_info(self, index): index (int): Index of the sample data to get. Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. 
It includes the following keys: - sample_idx (str): sample index @@ -137,13 +138,13 @@ def get_data_info(self, index): - sweeps (list[dict]): infos of sweeps - timestamp (float): sample timestamp - img_filename (str, optional): image filename - - lidar2img (list[np.ndarray], optional): transformations \ + - lidar2img (list[np.ndarray], optional): transformations from lidar to different cameras - ann_info (dict): annotation info """ info = self.data_infos[index] - # standard protocal modified from SECOND.Pytorch + # standard protocol modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], @@ -190,7 +191,7 @@ def get_ann_info(self, index): Returns: dict: Annotation information consists of the following keys: - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes. - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. @@ -275,10 +276,11 @@ def _evaluate_single(self, Args: result_path (str): Path of the result file. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - metric (str): Metric name used for evaluation. Default: 'bbox'. - result_name (str): Result name in the metric prefix. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. Default: 'pts_bbox'. Returns: @@ -312,18 +314,18 @@ def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): Args: results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - csv_savepath (str | None): The path for saving csv files. + csv_savepath (str): The path for saving csv files. It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. Returns: - tuple: Returns (result_files, tmp_dir), where `result_files` is a \ - dict containing the json filepaths, `tmp_dir` is the temporal \ - directory created for saving json files when \ + tuple: Returns (result_files, tmp_dir), where `result_files` is a + dict containing the json filepaths, `tmp_dir` is the temporal + directory created for saving json files when `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' @@ -372,19 +374,22 @@ def evaluate(self, Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing + metric (str | list[str], optional): Metrics to be evaluated. + Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str, optional): The prefix of json files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - csv_savepath (str | None): The path for saving csv files. + csv_savepath (str, optional): The path for saving csv files. 
It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. - show (bool): Whether to visualize. + result_names (list[str], optional): Result names in the + metric prefix. Default: ['pts_bbox']. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. @@ -407,8 +412,8 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: - self.show(results, out_dir, pipeline=pipeline) + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) return results_dict def _build_default_pipeline(self): @@ -432,13 +437,14 @@ def _build_default_pipeline(self): ] return Compose(pipeline) - def show(self, results, out_dir, show=True, pipeline=None): + def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. + show (bool): Whether to visualize the results online. + Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ @@ -517,16 +523,16 @@ def output_to_lyft_box(detection): box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() - # TODO: check whether this is necessary - # with dir_offset & dir_limit in the head - box_yaw = -box_yaw - np.pi / 2 + + # our LiDAR coordinate system -> Lyft box coordinate system + lyft_box_dims = box_dims[:, [1, 0, 2]] box_list = [] for i in range(len(box3d)): quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) box = LyftBox( box_gravity_center[i], - box_dims[i], + lyft_box_dims[i], quat, label=labels[i], score=scores[i]) diff --git a/mmdet3d/datasets/nuscenes_dataset.py b/mmdet3d/datasets/nuscenes_dataset.py index f924697377..b02d07e4bc 100644 --- a/mmdet3d/datasets/nuscenes_dataset.py +++ b/mmdet3d/datasets/nuscenes_dataset.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import tempfile +from os import path as osp + import mmcv import numpy as np import pyquaternion -import tempfile from nuscenes.utils.data_classes import Box as NuScenesBox -from os import path as osp from mmdet.datasets import DATASETS from ..core import show_result @@ -48,8 +49,9 @@ class NuScenesDataset(Custom3DDataset): Defaults to False. eval_version (bool, optional): Configuration version of evaluation. Defaults to 'detection_cvpr_2019'. - use_valid_flag (bool): Whether to use `use_valid_flag` key in the info - file as mask to filter gt_boxes and gt_names. Defaults to False. + use_valid_flag (bool, optional): Whether to use `use_valid_flag` key + in the info file as mask to filter gt_boxes and gt_names. + Defaults to False. """ NameMapping = { 'movable_object.barrier': 'barrier', @@ -196,7 +198,7 @@ def get_data_info(self, index): index (int): Index of the sample data to get. Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. @@ -204,12 +206,12 @@ def get_data_info(self, index): - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. 
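A small numeric illustration of the dims reordering introduced in output_to_lyft_box above (illustration only, not part of the patch): mmdet3d boxes carry (x_size, y_size, z_size) while LyftBox/NuScenesBox expect (width, length, height), hence the [1, 0, 2] index.

import numpy as np

box_dims = np.array([[4.2, 1.8, 1.6]])   # (x_size, y_size, z_size) from box3d.dims
lyft_box_dims = box_dims[:, [1, 0, 2]]   # [[1.8, 4.2, 1.6]], i.e. the (w, l, h) order LyftBox expects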
- - lidar2img (list[np.ndarray], optional): Transformations \ + - lidar2img (list[np.ndarray], optional): Transformations from lidar to different cameras. - ann_info (dict): Annotation info. """ info = self.data_infos[index] - # standard protocal modified from SECOND.Pytorch + # standard protocol modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], @@ -256,7 +258,7 @@ def get_ann_info(self, index): Returns: dict: Annotation information consists of the following keys: - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. @@ -374,10 +376,11 @@ def _evaluate_single(self, Args: result_path (str): Path of the result file. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - metric (str): Metric name used for evaluation. Default: 'bbox'. - result_name (str): Result name in the metric prefix. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. Default: 'pts_bbox'. Returns: @@ -427,14 +430,14 @@ def format_results(self, results, jsonfile_prefix=None): Args: results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: Returns (result_files, tmp_dir), where `result_files` is a \ - dict containing the json filepaths, `tmp_dir` is the temporal \ - directory created for saving json files when \ + tuple: Returns (result_files, tmp_dir), where `result_files` is a + dict containing the json filepaths, `tmp_dir` is the temporal + directory created for saving json files when `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' @@ -480,15 +483,16 @@ def evaluate(self, Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing + metric (str | list[str], optional): Metrics to be evaluated. + Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str, optional): The prefix of json files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - show (bool): Whether to visualize. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. 
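Sketch of the evaluate() call implied by the show/out_dir arguments documented above (illustrative only; dataset and results are placeholders): with the behaviour changed in the hunk below, passing out_dir alone saves visualizations offline, while show=True additionally opens the online viewer.

# Offline dump only: results are rendered to out_dir, no window is opened.
metrics = dataset.evaluate(results, metric='bbox', out_dir='work_dirs/nus_vis', show=False)

# Online visualization, as before.
metrics = dataset.evaluate(results, metric='bbox', show=True)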
@@ -510,8 +514,8 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: - self.show(results, out_dir, pipeline=pipeline) + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) return results_dict def _build_default_pipeline(self): @@ -535,13 +539,14 @@ def _build_default_pipeline(self): ] return Compose(pipeline) - def show(self, results, out_dir, show=True, pipeline=None): + def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. + show (bool): Whether to visualize the results online. + Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ @@ -588,9 +593,9 @@ def output_to_nusc_box(detection): box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() - # TODO: check whether this is necessary - # with dir_offset & dir_limit in the head - box_yaw = -box_yaw - np.pi / 2 + + # our LiDAR coordinate system -> nuScenes box coordinate system + nus_box_dims = box_dims[:, [1, 0, 2]] box_list = [] for i in range(len(box3d)): @@ -602,7 +607,7 @@ def output_to_nusc_box(detection): # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) box = NuScenesBox( box_gravity_center[i], - box_dims[i], + nus_box_dims[i], quat, label=labels[i], score=scores[i], @@ -624,7 +629,7 @@ def lidar_nusc_box_to_global(info, boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. - eval_version (str): Evaluation version. + eval_version (str, optional): Evaluation version. Default: 'detection_cvpr_2019' Returns: diff --git a/mmdet3d/datasets/nuscenes_mono_dataset.py b/mmdet3d/datasets/nuscenes_mono_dataset.py index 181b7943be..dd9a07d289 100644 --- a/mmdet3d/datasets/nuscenes_mono_dataset.py +++ b/mmdet3d/datasets/nuscenes_mono_dataset.py @@ -1,13 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import tempfile +import warnings +from os import path as osp + import mmcv import numpy as np import pyquaternion -import tempfile import torch -import warnings from nuscenes.utils.data_classes import Box as NuScenesBox -from os import path as osp from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr from mmdet.datasets import DATASETS, CocoDataset @@ -44,8 +45,9 @@ class NuScenesMonoDataset(CocoDataset): - 'Camera': Box in camera coordinates. eval_version (str, optional): Configuration version of evaluation. Defaults to 'detection_cvpr_2019'. - use_valid_flag (bool): Whether to use `use_valid_flag` key in the info - file as mask to filter gt_boxes and gt_names. Defaults to False. + use_valid_flag (bool, optional): Whether to use `use_valid_flag` key + in the info file as mask to filter gt_boxes and gt_names. + Defaults to False. version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. """ CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', @@ -140,8 +142,8 @@ def _parse_ann_info(self, img_info, ann_info): ann_info (list[dict]): Annotation info of an image. 
Returns: - dict: A dict containing the following keys: bboxes, labels, \ - gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \ + dict: A dict containing the following keys: bboxes, labels, + gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, depths, bboxes_ignore, masks, seg_map """ gt_bboxes = [] @@ -394,10 +396,11 @@ def _evaluate_single(self, Args: result_path (str): Path of the result file. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - metric (str): Metric name used for evaluation. Default: 'bbox'. - result_name (str): Result name in the metric prefix. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. Default: 'img_bbox'. Returns: @@ -448,13 +451,13 @@ def format_results(self, results, jsonfile_prefix=None, **kwargs): Args: results (list[tuple | numpy.ndarray]): Testing results of the dataset. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing \ - the json filepaths, tmp_dir is the temporal directory created \ + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. """ assert isinstance(results, list), 'results must be a list' @@ -504,15 +507,18 @@ def evaluate(self, Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str | None): Logger used for printing + metric (str | list[str], optional): Metrics to be evaluated. + Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - jsonfile_prefix (str | None): The prefix of json files. It includes + jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - show (bool): Whether to visualize. + result_names (list[str], optional): Result names in the + metric prefix. Default: ['img_bbox']. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. @@ -535,7 +541,7 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: + if show or out_dir: self.show(results, out_dir, pipeline=pipeline) return results_dict @@ -576,7 +582,7 @@ def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: - pipeline (list[dict] | None): Input pipeline. If None is given, \ + pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: @@ -601,13 +607,14 @@ def _build_default_pipeline(self): ] return Compose(pipeline) - def show(self, results, out_dir, show=True, pipeline=None): + def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. 
Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. + show (bool): Whether to visualize the results online. + Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ @@ -696,7 +703,7 @@ def cam_nusc_box_to_global(info, boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. - eval_version (str): Evaluation version. + eval_version (str, optional): Evaluation version. Default: 'detection_cvpr_2019' Returns: @@ -736,7 +743,7 @@ def global_nusc_box_to_cam(info, boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. - eval_version (str): Evaluation version. + eval_version (str, optional): Evaluation version. Default: 'detection_cvpr_2019' Returns: @@ -769,7 +776,7 @@ def nusc_box_to_cam_box3d(boxes): boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. Returns: - tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \ + tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): Converted 3D bounding boxes, scores and labels. """ locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) diff --git a/mmdet3d/datasets/pipelines/__init__.py b/mmdet3d/datasets/pipelines/__init__.py index 68da65a0bf..d611184986 100644 --- a/mmdet3d/datasets/pipelines/__init__.py +++ b/mmdet3d/datasets/pipelines/__init__.py @@ -3,17 +3,19 @@ from .dbsampler import DataBaseSampler from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D, - LoadMultiViewImageFromFiles, LoadPointsFromFile, - LoadPointsFromMultiSweeps, NormalizePointsColor, - PointSegClassMapping) + LoadMultiViewImageFromFiles, LoadPointsFromDict, + LoadPointsFromFile, LoadPointsFromMultiSweeps, + NormalizePointsColor, PointSegClassMapping) from .test_time_aug import MultiScaleFlipAug3D -from .transforms_3d import (BackgroundPointsFilter, GlobalAlignment, - GlobalRotScaleTrans, IndoorPatchPointSample, - IndoorPointSample, ObjectNameFilter, ObjectNoise, - ObjectRangeFilter, ObjectSample, PointSample, - PointShuffle, PointsRangeFilter, - RandomDropPointsColor, RandomFlip3D, - RandomJitterPoints, VoxelBasedPointSampler) +# yapf: disable +from .transforms_3d import (AffineResize, BackgroundPointsFilter, + GlobalAlignment, GlobalRotScaleTrans, + IndoorPatchPointSample, IndoorPointSample, + ObjectNameFilter, ObjectNoise, ObjectRangeFilter, + ObjectSample, PointSample, PointShuffle, + PointsRangeFilter, RandomDropPointsColor, + RandomFlip3D, RandomJitterPoints, RandomShiftScale, + VoxelBasedPointSampler) __all__ = [ 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', @@ -25,5 +27,6 @@ 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', - 'RandomJitterPoints' + 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', + 'LoadPointsFromDict' ] diff --git a/mmdet3d/datasets/pipelines/data_augment_utils.py b/mmdet3d/datasets/pipelines/data_augment_utils.py index 23b6aada74..21be3c06fb 100644 --- a/mmdet3d/datasets/pipelines/data_augment_utils.py +++ 
b/mmdet3d/datasets/pipelines/data_augment_utils.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings + import numba import numpy as np -import warnings -from numba.errors import NumbaPerformanceWarning +from numba.core.errors import NumbaPerformanceWarning from mmdet3d.core.bbox import box_np_ops @@ -21,8 +22,8 @@ def _rotation_box2d_jit_(corners, angle, rot_mat_T): rot_sin = np.sin(angle) rot_cos = np.cos(angle) rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos corners[:] = corners @ rot_mat_T @@ -34,8 +35,8 @@ def box_collision_test(boxes, qboxes, clockwise=True): Args: boxes (np.ndarray): Corners of current boxes. qboxes (np.ndarray): Boxes to be avoid colliding. - clockwise (bool): Whether the corners are in clockwise order. - Default: True. + clockwise (bool, optional): Whether the corners are in + clockwise order. Default: True. """ N = boxes.shape[0] K = qboxes.shape[0] @@ -211,8 +212,8 @@ def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, rot_sin = np.sin(current_box[0, -1]) rot_cos = np.cos(current_box[0, -1]) rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos current_corners[:] = current_box[ 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] @@ -264,18 +265,18 @@ def _rotation_matrix_3d_(rot_mat_T, angle, axis): rot_mat_T[:] = np.eye(3) if axis == 1: rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 2] = -rot_sin - rot_mat_T[2, 0] = rot_sin + rot_mat_T[0, 2] = rot_sin + rot_mat_T[2, 0] = -rot_sin rot_mat_T[2, 2] = rot_cos elif axis == 2 or axis == -1: rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = -rot_sin - rot_mat_T[1, 0] = rot_sin + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos elif axis == 0: rot_mat_T[1, 1] = rot_cos - rot_mat_T[1, 2] = -rot_sin - rot_mat_T[2, 1] = rot_sin + rot_mat_T[1, 2] = rot_sin + rot_mat_T[2, 1] = -rot_sin rot_mat_T[2, 2] = rot_cos @@ -317,7 +318,7 @@ def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): boxes (np.ndarray): 3D boxes to be transformed. loc_transform (np.ndarray): Location transform to be applied. rot_transform (np.ndarray): Rotation transform to be applied. - valid_mask (np.ndarray | None): Mask to indicate which boxes are valid. + valid_mask (np.ndarray): Mask to indicate which boxes are valid. """ num_box = boxes.shape[0] for i in range(num_box): @@ -338,16 +339,17 @@ def noise_per_object_v3_(gt_boxes, Args: gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7). - points (np.ndarray | None): Input point cloud with shape (M, 4). - Default: None. - valid_mask (np.ndarray | None): Mask to indicate which boxes are valid. - Default: None. - rotation_perturb (float): Rotation perturbation. Default: pi / 4. - center_noise_std (float): Center noise standard deviation. + points (np.ndarray, optional): Input point cloud with + shape (M, 4). Default: None. + valid_mask (np.ndarray, optional): Mask to indicate which + boxes are valid. Default: None. + rotation_perturb (float, optional): Rotation perturbation. + Default: pi / 4. + center_noise_std (float, optional): Center noise standard deviation. Default: 1.0. - global_random_rot_range (float): Global random rotation range. - Default: pi/4. - num_try (int): Number of try. Default: 100. + global_random_rot_range (float, optional): Global random rotation + range. 
Default: pi/4. + num_try (int, optional): Number of try. Default: 100. """ num_boxes = gt_boxes.shape[0] if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): diff --git a/mmdet3d/datasets/pipelines/dbsampler.py b/mmdet3d/datasets/pipelines/dbsampler.py index 8a2455532e..f0a7074441 100644 --- a/mmdet3d/datasets/pipelines/dbsampler.py +++ b/mmdet3d/datasets/pipelines/dbsampler.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import os + import mmcv import numpy as np -import os from mmdet3d.core.bbox import box_np_ops from mmdet3d.datasets.pipelines import data_augment_utils @@ -15,10 +16,10 @@ class BatchSampler: Args: sample_list (list[dict]): List of samples. - name (str | None): The category of samples. Default: None. - epoch (int | None): Sampling epoch. Default: None. - shuffle (bool): Whether to shuffle indices. Default: False. - drop_reminder (bool): Drop reminder. Default: False. + name (str, optional): The category of samples. Default: None. + epoch (int, optional): Sampling epoch. Default: None. + shuffle (bool, optional): Whether to shuffle indices. Default: False. + drop_reminder (bool, optional): Drop reminder. Default: False. """ def __init__(self, @@ -87,9 +88,9 @@ class DataBaseSampler(object): rate (float): Rate of actual sampled over maximum sampled number. prepare (dict): Name of preparation functions and the input value. sample_groups (dict): Sampled classes and numbers. - classes (list[str]): List of classes. Default: None. - points_loader(dict): Config of points loader. Default: dict( - type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) + classes (list[str], optional): List of classes. Default: None. + points_loader(dict, optional): Config of points loader. Default: + dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) """ def __init__(self, @@ -188,7 +189,7 @@ def filter_by_min_points(db_infos, min_gt_points_dict): db_infos[name] = filtered_infos return db_infos - def sample_all(self, gt_bboxes, gt_labels, img=None): + def sample_all(self, gt_bboxes, gt_labels, img=None, ground_plane=None): """Sampling all categories of bboxes. Args: @@ -198,9 +199,9 @@ def sample_all(self, gt_bboxes, gt_labels, img=None): Returns: dict: Dict of sampled 'pseudo ground truths'. - - gt_labels_3d (np.ndarray): ground truths labels \ + - gt_labels_3d (np.ndarray): ground truths labels of sampled objects. 
- - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): sampled ground truth 3D bounding boxes - points (np.ndarray): sampled points - group_ids (np.ndarray): ids of sampled ground truths @@ -263,6 +264,15 @@ def sample_all(self, gt_bboxes, gt_labels, img=None): gt_labels = np.array([self.cat2label[s['name']] for s in sampled], dtype=np.long) + + if ground_plane is not None: + xyz = sampled_gt_bboxes[:, :3] + dz = (ground_plane[:3][None, :] * + xyz).sum(-1) + ground_plane[3] + sampled_gt_bboxes[:, 2] -= dz + for i, s_points in enumerate(s_points_list): + s_points.tensor[:, 2].sub_(dz[i]) + ret = { 'gt_labels_3d': gt_labels, diff --git a/mmdet3d/datasets/pipelines/formating.py b/mmdet3d/datasets/pipelines/formating.py index c303de8338..8f9c96f989 100644 --- a/mmdet3d/datasets/pipelines/formating.py +++ b/mmdet3d/datasets/pipelines/formating.py @@ -24,7 +24,7 @@ class DefaultFormatBundle(object): - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) - - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, (3)to DataContainer (stack=True) """ @@ -92,8 +92,8 @@ class Collect3D(object): The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - - 'img_shape': shape of the image input to the network as a tuple \ - (h, w, c). Note that images may be zero padded on the \ + - 'img_shape': shape of the image input to the network as a tuple + (h, w, c). Note that images may be zero padded on the bottom/right if the batch tensor is larger than this shape. - 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used @@ -103,9 +103,9 @@ class Collect3D(object): - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + - 'pcd_horizontal_flip': a boolean indicating if point cloud is flipped horizontally - - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + - 'pcd_vertical_flip': a boolean indicating if point cloud is flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type @@ -130,15 +130,16 @@ class Collect3D(object): 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ - def __init__(self, - keys, - meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', - 'depth2img', 'cam2img', 'pad_shape', - 'scale_factor', 'flip', 'pcd_horizontal_flip', - 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', - 'img_norm_cfg', 'pcd_trans', 'sample_idx', - 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', - 'transformation_3d_flow')): + def __init__( + self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', + 'pts_filename', 'transformation_3d_flow', 'trans_mat', + 'affine_aug')): self.keys = keys self.meta_keys = meta_keys diff --git a/mmdet3d/datasets/pipelines/loading.py b/mmdet3d/datasets/pipelines/loading.py index 58f5a86fa3..f3ff0776a6 100644 --- a/mmdet3d/datasets/pipelines/loading.py +++ 
b/mmdet3d/datasets/pipelines/loading.py @@ -14,9 +14,10 @@ class LoadMultiViewImageFromFiles(object): Expects results['img_filename'] to be a list of filenames. Args: - to_float32 (bool): Whether to convert the img to float32. + to_float32 (bool, optional): Whether to convert the img to float32. Defaults to False. - color_type (str): Color type of the file. Defaults to 'unchanged'. + color_type (str, optional): Color type of the file. + Defaults to 'unchanged'. """ def __init__(self, to_float32=False, color_type='unchanged'): @@ -30,7 +31,7 @@ def __call__(self, results): results (dict): Result dict containing multi-view image filenames. Returns: - dict: The result dict containing the multi-view image data. \ + dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. @@ -48,7 +49,7 @@ def __call__(self, results): if self.to_float32: img = img.astype(np.float32) results['filename'] = filename - # unravel to list, see `DefaultFormatBundle` in formating.py + # unravel to list, see `DefaultFormatBundle` in formatting.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape @@ -77,7 +78,7 @@ class LoadImageFromFileMono3D(LoadImageFromFile): detection, additional camera parameters need to be loaded. Args: - kwargs (dict): Arguments are the same as those in \ + kwargs (dict): Arguments are the same as those in :class:`LoadImageFromFile`. """ @@ -102,17 +103,20 @@ class LoadPointsFromMultiSweeps(object): This is usually used for nuScenes dataset to utilize previous sweeps. Args: - sweeps_num (int): Number of sweeps. Defaults to 10. - load_dim (int): Dimension number of the loaded points. Defaults to 5. - use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. - file_client_args (dict): Config dict of file clients, refer to + sweeps_num (int, optional): Number of sweeps. Defaults to 10. + load_dim (int, optional): Dimension number of the loaded points. + Defaults to 5. + use_dim (list[int], optional): Which dimension to use. + Defaults to [0, 1, 2, 4]. + file_client_args (dict, optional): Config dict of file clients, + refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). - pad_empty_sweeps (bool): Whether to repeat keyframe when + pad_empty_sweeps (bool, optional): Whether to repeat keyframe when sweeps is empty. Defaults to False. - remove_close (bool): Whether to remove close points. + remove_close (bool, optional): Whether to remove close points. Defaults to False. - test_mode (bool): If test_model=True used for testing, it will not + test_mode (bool, optional): If `test_mode=True`, it will not randomly sample sweeps but select the nearest N frames. Defaults to False. """ @@ -161,7 +165,7 @@ def _remove_close(self, points, radius=1.0): Args: points (np.ndarray | :obj:`BasePoints`): Sweep points. - radius (float): Radius below which points are removed. + radius (float, optional): Radius below which points are removed. Defaults to 1.0. Returns: @@ -182,14 +186,14 @@ def __call__(self, results): """Call function to load multi-sweep point clouds from files. Args: - results (dict): Result dict containing multi-sweep point cloud \ + results (dict): Result dict containing multi-sweep point cloud filenames. Returns: - dict: The result dict containing the multi-sweep points data. 
\ + dict: The result dict containing the multi-sweep points data. Added key and value are described below. - - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point cloud arrays. """ points = results['points'] @@ -243,8 +247,8 @@ class PointSegClassMapping(object): Args: valid_cat_ids (tuple[int]): A tuple of valid category. - max_cat_id (int): The max possible cat_id in input segmentation mask. - Defaults to 40. + max_cat_id (int, optional): The max possible cat_id in input + segmentation mask. Defaults to 40. """ def __init__(self, valid_cat_ids, max_cat_id=40): @@ -268,7 +272,7 @@ def __call__(self, results): results (dict): Result dict containing point semantic masks. Returns: - dict: The result dict containing the mapped category ids. \ + dict: The result dict containing the mapped category ids. Updated key and value are described below. - pts_semantic_mask (np.ndarray): Mapped semantic masks. @@ -307,7 +311,7 @@ def __call__(self, results): results (dict): Result dict containing point clouds data. Returns: - dict: The result dict containing the normalized points. \ + dict: The result dict containing the normalized points. Updated key and value are described below. - points (:obj:`BasePoints`): Points after color normalization. @@ -334,7 +338,7 @@ def __repr__(self): class LoadPointsFromFile(object): """Load Points From File. - Load sunrgbd and scannet points from file. + Load points from file. Args: coord_type (str): The type of coordinates of points cloud. @@ -342,14 +346,17 @@ class LoadPointsFromFile(object): - 'LIDAR': Points in LiDAR coordinates. - 'DEPTH': Points in depth coordinates, usually for indoor dataset. - 'CAMERA': Points in camera coordinates. - load_dim (int): The dimension of the loaded points. + load_dim (int, optional): The dimension of the loaded points. Defaults to 6. - use_dim (list[int]): Which dimensions of the points to be used. + use_dim (list[int], optional): Which dimensions of the points to use. Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 or use_dim=[0, 1, 2, 3] to use the intensity dimension. - shift_height (bool): Whether to use shifted height. Defaults to False. - use_color (bool): Whether to use color features. Defaults to False. - file_client_args (dict): Config dict of file clients, refer to + shift_height (bool, optional): Whether to use shifted height. + Defaults to False. + use_color (bool, optional): Whether to use color features. + Defaults to False. + file_client_args (dict, optional): Config dict of file clients, + refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). """ @@ -405,7 +412,7 @@ def __call__(self, results): results (dict): Result dict containing point clouds data. Returns: - dict: The result dict containing the point clouds data. \ + dict: The result dict containing the point clouds data. Added key and value are described below. - points (:obj:`BasePoints`): Point clouds data. @@ -453,6 +460,15 @@ def __repr__(self): return repr_str +@PIPELINES.register_module() +class LoadPointsFromDict(LoadPointsFromFile): + """Load Points From Dict.""" + + def __call__(self, results): + assert 'points' in results + return results + + @PIPELINES.register_module() class LoadAnnotations3D(LoadAnnotations): """Load Annotations3D. 
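Reviewer note on the dbsampler.py change above: when `ObjectSample` is built with `use_ground_plane=True`, `sample_all` now snaps every sampled box (and its points) onto the provided plane. A minimal standalone NumPy sketch of that correction, assuming boxes are (N, 7+) arrays whose first three columns are x, y, z and the plane is (a, b, c, d) with a*x + b*y + c*z + d = 0, as in the docstrings; this is an illustration, not part of the patch:

import numpy as np

def snap_boxes_to_ground(sampled_gt_bboxes, ground_plane):
    """Shift the z of each sampled box onto the ground plane (mirrors sample_all)."""
    xyz = sampled_gt_bboxes[:, :3]
    # signed offset of each box centre from the plane (the same dz as in the patch)
    dz = (ground_plane[:3][None, :] * xyz).sum(-1) + ground_plane[3]
    out = sampled_gt_bboxes.copy()
    out[:, 2] -= dz
    return out, dz

# e.g. a flat road plane at z = -1.6 in LiDAR coordinates:
# boxes, dz = snap_boxes_to_ground(boxes, np.array([0., 0., 1., 1.6]))
# a box centred at z = -1.0 gets dz = 0.6 and ends up at z = -1.6

In the patch, each object's points then receive the same per-box shift (`s_points.tensor[:, 2] -= dz[i]`), so the pasted point clouds stay consistent with their boxes.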
diff --git a/mmdet3d/datasets/pipelines/test_time_aug.py b/mmdet3d/datasets/pipelines/test_time_aug.py index 790476c58b..04bf8970e0 100644 --- a/mmdet3d/datasets/pipelines/test_time_aug.py +++ b/mmdet3d/datasets/pipelines/test_time_aug.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv import warnings from copy import deepcopy +import mmcv + from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import Compose @@ -16,18 +17,19 @@ class MultiScaleFlipAug3D(object): img_scale (tuple | list[tuple]: Images scales for resizing. pts_scale_ratio (float | list[float]): Points scale ratios for resizing. - flip (bool): Whether apply flip augmentation. Defaults to False. - flip_direction (str | list[str]): Flip augmentation directions - for images, options are "horizontal" and "vertical". + flip (bool, optional): Whether apply flip augmentation. + Defaults to False. + flip_direction (str | list[str], optional): Flip augmentation + directions for images, options are "horizontal" and "vertical". If flip_direction is list, multiple flip augmentations will be applied. It has no effect when ``flip == False``. Defaults to "horizontal". - pcd_horizontal_flip (bool): Whether apply horizontal flip augmentation - to point cloud. Defaults to True. Note that it works only when - 'flip' is turned on. - pcd_vertical_flip (bool): Whether apply vertical flip augmentation - to point cloud. Defaults to True. Note that it works only when - 'flip' is turned on. + pcd_horizontal_flip (bool, optional): Whether apply horizontal + flip augmentation to point cloud. Defaults to True. + Note that it works only when 'flip' is turned on. + pcd_vertical_flip (bool, optional): Whether apply vertical flip + augmentation to point cloud. Defaults to True. + Note that it works only when 'flip' is turned on. """ def __init__(self, @@ -70,7 +72,7 @@ def __call__(self, results): results (dict): Result dict contains the data to augment. Returns: - dict: The result dict contains the data that is augmented with \ + dict: The result dict contains the data that is augmented with different scales and flips. """ aug_data = [] diff --git a/mmdet3d/datasets/pipelines/transforms_3d.py b/mmdet3d/datasets/pipelines/transforms_3d.py index dc34004d65..6269dc217b 100644 --- a/mmdet3d/datasets/pipelines/transforms_3d.py +++ b/mmdet3d/datasets/pipelines/transforms_3d.py @@ -1,6 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np +import random import warnings + +import cv2 +import numpy as np from mmcv import is_tuple_of from mmcv.utils import build_from_cfg @@ -22,7 +25,7 @@ class RandomDropPointsColor(object): util/transform.py#L223>`_ for more details. Args: - drop_ratio (float): The probability of dropping point colors. + drop_ratio (float, optional): The probability of dropping point colors. Defaults to 0.2. """ @@ -38,7 +41,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after color dropping, \ + dict: Results after color dropping, 'points' key is updated in the result dict. """ points = input_dict['points'] @@ -105,10 +108,11 @@ def random_flip_data_3d(self, input_dict, direction='horizontal'): Args: input_dict (dict): Result dict from loading pipeline. - direction (str): Flip direction. Default: horizontal. + direction (str, optional): Flip direction. + Default: 'horizontal'. 
Returns: - dict: Flipped results, 'points', 'bbox3d_fields' keys are \ + dict: Flipped results, 'points', 'bbox3d_fields' keys are updated in the result dict. """ assert direction in ['horizontal', 'vertical'] @@ -141,15 +145,15 @@ def random_flip_data_3d(self, input_dict, direction='horizontal'): input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] def __call__(self, input_dict): - """Call function to flip points, values in the ``bbox3d_fields`` and \ + """Call function to flip points, values in the ``bbox3d_fields`` and also flip 2D image and its annotations. Args: input_dict (dict): Result dict from loading pipeline. Returns: - dict: Flipped results, 'flip', 'flip_direction', \ - 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \ + dict: Flipped results, 'flip', 'flip_direction', + 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added into result dict. """ # flip 2D image and its annotations @@ -191,20 +195,20 @@ def __repr__(self): class RandomJitterPoints(object): """Randomly jitter point coordinates. - Different from the global translation in ``GlobalRotScaleTrans``, here we \ + Different from the global translation in ``GlobalRotScaleTrans``, here we apply different noises to each point in a scene. Args: jitter_std (list[float]): The standard deviation of jittering noise. - This applies random noise to all points in a 3D scene, which is \ - sampled from a gaussian distribution whose standard deviation is \ + This applies random noise to all points in a 3D scene, which is + sampled from a gaussian distribution whose standard deviation is set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] - clip_range (list[float] | None): Clip the randomly generated jitter \ + clip_range (list[float]): Clip the randomly generated jitter noise into this range. If None is given, don't perform clipping. Defaults to [-0.05, 0.05] Note: - This transform should only be used in point cloud segmentation tasks \ + This transform should only be used in point cloud segmentation tasks because we don't transform ground-truth bboxes accordingly. For similar transform in detection task, please refer to `ObjectNoise`. """ @@ -233,7 +237,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after adding noise to each point, \ + dict: Results after adding noise to each point, 'points' key is updated in the result dict. """ points = input_dict['points'] @@ -264,14 +268,17 @@ class ObjectSample(object): sample_2d (bool): Whether to also paste 2D image patch to the images This should be true when applying multi-modality cut-and-paste. Defaults to False. + use_ground_plane (bool): Whether to use gound plane to adjust the + 3D labels. """ - def __init__(self, db_sampler, sample_2d=False): + def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): self.sampler_cfg = db_sampler self.sample_2d = sample_2d if 'type' not in db_sampler.keys(): db_sampler['type'] = 'DataBaseSampler' self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + self.use_ground_plane = use_ground_plane @staticmethod def remove_points_in_boxes(points, boxes): @@ -295,13 +302,18 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after object sampling augmentation, \ - 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated \ + dict: Results after object sampling augmentation, + 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. 
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] + if self.use_ground_plane and 'plane' in input_dict['ann_info']: + ground_plane = input_dict['ann_info']['plane'] + input_dict['plane'] = ground_plane + else: + ground_plane = None # change to float for blending operation points = input_dict['points'] if self.sample_2d: @@ -315,7 +327,10 @@ def __call__(self, input_dict): img=img) else: sampled_dict = self.db_sampler.sample_all( - gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None) + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + img=None, + ground_plane=ground_plane) if sampled_dict is not None: sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] @@ -392,13 +407,13 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after adding noise to each object, \ + dict: Results after adding noise to each object, 'points', 'gt_bboxes_3d' keys are updated in the result dict. """ gt_bboxes_3d = input_dict['gt_bboxes_3d'] points = input_dict['points'] - # TODO: check this inplace function + # TODO: this is inplace operation numpy_box = gt_bboxes_3d.tensor.numpy() numpy_points = points.tensor.numpy() @@ -432,10 +447,10 @@ class GlobalAlignment(object): rotation_axis (int): Rotation axis for points and bboxes rotation. Note: - We do not record the applied rotation and translation as in \ - GlobalRotScaleTrans. Because usually, we do not need to reverse \ + We do not record the applied rotation and translation as in + GlobalRotScaleTrans. Because usually, we do not need to reverse the alignment step. - For example, ScanNet 3D detection task uses aligned ground-truth \ + For example, ScanNet 3D detection task uses aligned ground-truth bounding boxes for evaluation. """ @@ -487,7 +502,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after global alignment, 'points' and keys in \ + dict: Results after global alignment, 'points' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ @@ -516,15 +531,15 @@ class GlobalRotScaleTrans(object): """Apply global rotation, scaling and translation to a 3D scene. Args: - rot_range (list[float]): Range of rotation angle. + rot_range (list[float], optional): Range of rotation angle. Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). - scale_ratio_range (list[float]): Range of scale ratio. + scale_ratio_range (list[float], optional): Range of scale ratio. Defaults to [0.95, 1.05]. - translation_std (list[float]): The standard deviation of translation - noise. This applies random translation to a scene by a noise, which + translation_std (list[float], optional): The standard deviation of + translation noise applied to a scene, which is sampled from a gaussian distribution whose standard deviation is set by ``translation_std``. Defaults to [0, 0, 0] - shift_height (bool): Whether to shift height. + shift_height (bool, optional): Whether to shift height. (the fourth dimension of indoor points) when scaling. Defaults to False. """ @@ -563,8 +578,8 @@ def _trans_bbox_points(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after translation, 'points', 'pcd_trans' \ - and keys in input_dict['bbox3d_fields'] are updated \ + dict: Results after translation, 'points', 'pcd_trans' + and keys in input_dict['bbox3d_fields'] are updated in the result dict. 
""" translation_std = np.array(self.translation_std, dtype=np.float32) @@ -582,8 +597,8 @@ def _rot_bbox_points(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after rotation, 'points', 'pcd_rotation' \ - and keys in input_dict['bbox3d_fields'] are updated \ + dict: Results after rotation, 'points', 'pcd_rotation' + and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ rotation = self.rot_range @@ -593,6 +608,7 @@ def _rot_bbox_points(self, input_dict): if len(input_dict['bbox3d_fields']) == 0: rot_mat_T = input_dict['points'].rotate(noise_rotation) input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation return # rotate points with bboxes @@ -602,6 +618,7 @@ def _rot_bbox_points(self, input_dict): noise_rotation, input_dict['points']) input_dict['points'] = points input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation def _scale_bbox_points(self, input_dict): """Private function to scale bounding boxes and points. @@ -610,7 +627,7 @@ def _scale_bbox_points(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after scaling, 'points'and keys in \ + dict: Results after scaling, 'points'and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ scale = input_dict['pcd_scale_factor'] @@ -632,7 +649,7 @@ def _random_scale(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after scaling, 'pcd_scale_factor' are updated \ + dict: Results after scaling, 'pcd_scale_factor' are updated in the result dict. """ scale_factor = np.random.uniform(self.scale_ratio_range[0], @@ -640,7 +657,7 @@ def _random_scale(self, input_dict): input_dict['pcd_scale_factor'] = scale_factor def __call__(self, input_dict): - """Private function to rotate, scale and translate bounding boxes and \ + """Private function to rotate, scale and translate bounding boxes and points. Args: @@ -648,7 +665,7 @@ def __call__(self, input_dict): Returns: dict: Results after scaling, 'points', 'pcd_rotation', - 'pcd_scale_factor', 'pcd_trans' and keys in \ + 'pcd_scale_factor', 'pcd_trans' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ if 'transformation_3d_flow' not in input_dict: @@ -686,7 +703,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' \ + dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ idx = input_dict['points'].shuffle() @@ -725,7 +742,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. """ # Check points instance type and initialise bev_range @@ -777,7 +794,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' \ + dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = input_dict['points'] @@ -823,7 +840,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. 
Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. """ gt_labels_3d = input_dict['gt_labels_3d'] @@ -891,8 +908,8 @@ def _points_random_sampling(self, if sample_range is not None and not replace: # Only sampling the near points when len(points) >= num_samples depth = np.linalg.norm(points.tensor, axis=1) - far_inds = np.where(depth > sample_range)[0] - near_inds = np.where(depth <= sample_range)[0] + far_inds = np.where(depth >= sample_range)[0] + near_inds = np.where(depth < sample_range)[0] # in case there are too many far points if len(far_inds) > num_samples: far_inds = np.random.choice( @@ -915,7 +932,7 @@ def __call__(self, results): Args: input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' \ + dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = results['points'] @@ -996,10 +1013,10 @@ class IndoorPatchPointSample(object): additional features. Defaults to False. num_try (int, optional): Number of times to try if the patch selected is invalid. Defaults to 10. - enlarge_size (float | None, optional): Enlarge the sampled patch to + enlarge_size (float, optional): Enlarge the sampled patch to [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as an augmentation. If None, set it as 0. Defaults to 0.2. - min_unique_num (int | None, optional): Minimum number of unique points + min_unique_num (int, optional): Minimum number of unique points the sampled patch should contain. If None, use PointNet++'s method to judge uniqueness. Defaults to None. eps (float, optional): A value added to patch boundary to guarantee @@ -1040,7 +1057,7 @@ def _input_generation(self, coords, patch_center, coord_max, attributes, attribute_dims, point_type): """Generating model input. - Generate input by subtracting patch center and adding additional \ + Generate input by subtracting patch center and adding additional features. Currently support colors and normalized xyz as features. Args: @@ -1184,7 +1201,7 @@ def __call__(self, results): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' \ + dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = results['points'] @@ -1244,7 +1261,7 @@ def __call__(self, input_dict): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' \ + dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = input_dict['points'] @@ -1342,7 +1359,7 @@ def __call__(self, results): input_dict (dict): Result dict from loading pipeline. Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' \ + dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = results['points'] @@ -1423,3 +1440,258 @@ def _auto_indent(repr_str, indent): repr_str += ' ' * indent + 'prev_voxel_generator=\n' repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' return repr_str + + +@PIPELINES.register_module() +class AffineResize(object): + """Get the affine transform matrices to the target size. 
+ + Different from :class:`RandomAffine` in MMDetection, this class can + calculate the affine transform matrices while resizing the input image + to a fixed size. The affine transform matrices include: 1) matrix + transforming original image to the network input image size. 2) matrix + transforming original image to the network output feature map size. + + Args: + img_scale (tuple): Images scales for resizing. + down_ratio (int): The down ratio of feature map. + Actually the arg should be >= 1. + bbox_clip_border (bool, optional): Whether clip the objects + outside the border of the image. Defaults to True. + """ + + def __init__(self, img_scale, down_ratio, bbox_clip_border=True): + + self.img_scale = img_scale + self.down_ratio = down_ratio + self.bbox_clip_border = bbox_clip_border + + def __call__(self, results): + """Call function to do affine transform to input image and labels. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after affine resize, 'affine_aug', 'trans_mat' + keys are added in the result dict. + """ + # The results have gone through RandomShiftScale before AffineResize + if 'center' not in results: + img = results['img'] + height, width = img.shape[:2] + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + results['affine_aug'] = False + else: + # The results did not go through RandomShiftScale before + # AffineResize + img = results['img'] + center = results['center'] + size = results['size'] + + trans_affine = self._get_transform_matrix(center, size, self.img_scale) + + img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) + + if isinstance(self.down_ratio, tuple): + trans_mat = [ + self._get_transform_matrix( + center, size, + (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) + for ratio in self.down_ratio + ] # (3, 3) + else: + trans_mat = self._get_transform_matrix( + center, size, (self.img_scale[0] // self.down_ratio, + self.img_scale[1] // self.down_ratio)) + + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + results['trans_mat'] = trans_mat + + self._affine_bboxes(results, trans_affine) + + if 'centers2d' in results: + centers2d = self._affine_transform(results['centers2d'], + trans_affine) + valid_index = (centers2d[:, 0] > + 0) & (centers2d[:, 0] < + self.img_scale[0]) & (centers2d[:, 1] > 0) & ( + centers2d[:, 1] < self.img_scale[1]) + results['centers2d'] = centers2d[valid_index] + + for key in results.get('bbox_fields', []): + if key in ['gt_bboxes']: + results[key] = results[key][valid_index] + if 'gt_labels' in results: + results['gt_labels'] = results['gt_labels'][ + valid_index] + if 'gt_masks' in results: + raise NotImplementedError( + 'AffineResize only supports bbox.') + + for key in results.get('bbox3d_fields', []): + if key in ['gt_bboxes_3d']: + results[key].tensor = results[key].tensor[valid_index] + if 'gt_labels_3d' in results: + results['gt_labels_3d'] = results['gt_labels_3d'][ + valid_index] + + results['depths'] = results['depths'][valid_index] + + return results + + def _affine_bboxes(self, results, matrix): + """Affine transform bboxes to input image. + + Args: + results (dict): Result dict from loading pipeline. + matrix (np.ndarray): Matrix transforming original + image to the network input image size. 
+ shape: (3, 3) + """ + + for key in results.get('bbox_fields', []): + bboxes = results[key] + bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) + bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) + if self.bbox_clip_border: + bboxes[:, + [0, 2]] = bboxes[:, + [0, 2]].clip(0, self.img_scale[0] - 1) + bboxes[:, + [1, 3]] = bboxes[:, + [1, 3]].clip(0, self.img_scale[1] - 1) + results[key] = bboxes + + def _affine_transform(self, points, matrix): + """Affine transform bbox points to input image. + + Args: + points (np.ndarray): Points to be transformed. + shape: (N, 2) + matrix (np.ndarray): Affine transform matrix. + shape: (3, 3) + + Returns: + np.ndarray: Transformed points. + """ + num_points = points.shape[0] + hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), + axis=1) + hom_points_2d = hom_points_2d.T + affined_points = np.matmul(matrix, hom_points_2d).T + return affined_points[:, :2] + + def _get_transform_matrix(self, center, scale, output_scale): + """Get affine transform matrix. + + Args: + center (tuple): Center of current image. + scale (tuple): Scale of current image. + output_scale (tuple[float]): The transform target image scales. + + Returns: + np.ndarray: Affine transform matrix. + """ + # TODO: further add rot and shift here. + src_w = scale[0] + dst_w = output_scale[0] + dst_h = output_scale[1] + + src_dir = np.array([0, src_w * -0.5]) + dst_dir = np.array([0, dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + src[1, :] = center + src_dir + dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2, :] = self._get_ref_point(src[0, :], src[1, :]) + dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) + + get_matrix = cv2.getAffineTransform(src, dst) + + matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) + + return matrix.astype(np.float32) + + def _get_ref_point(self, ref_point1, ref_point2): + """Get reference point to calculate affine transform matrix. + + While using opencv to calculate the affine matrix, we need at least + three corresponding points separately on original image and target + image. Here we use two points to get the the third reference point. + """ + d = ref_point1 - ref_point2 + ref_point3 = ref_point2 + np.array([-d[1], d[0]]) + return ref_point3 + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'down_ratio={self.down_ratio}) ' + return repr_str + + +@PIPELINES.register_module() +class RandomShiftScale(object): + """Random shift scale. + + Different from the normal shift and scale function, it doesn't + directly shift or scale image. It can record the shift and scale + infos into loading pipelines. It's designed to be used with + AffineResize together. + + Args: + shift_scale (tuple[float]): Shift and scale range. + aug_prob (float): The shifting and scaling probability. + """ + + def __init__(self, shift_scale, aug_prob): + + self.shift_scale = shift_scale + self.aug_prob = aug_prob + + def __call__(self, results): + """Call function to record random shift and scale infos. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after random shift and scale, 'center', 'size' + and 'affine_aug' keys are added in the result dict. 
+ """ + img = results['img'] + + height, width = img.shape[:2] + + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + + if random.random() < self.aug_prob: + shift, scale = self.shift_scale[0], self.shift_scale[1] + shift_ranges = np.arange(-shift, shift + 0.1, 0.1) + center[0] += size[0] * random.choice(shift_ranges) + center[1] += size[1] * random.choice(shift_ranges) + scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) + size *= random.choice(scale_ranges) + results['affine_aug'] = True + else: + results['affine_aug'] = False + + results['center'] = center + results['size'] = size + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(shift_scale={self.shift_scale}, ' + repr_str += f'aug_prob={self.aug_prob}) ' + return repr_str diff --git a/mmdet3d/datasets/s3dis_dataset.py b/mmdet3d/datasets/s3dis_dataset.py index 0955fb7b92..49845ab1fa 100644 --- a/mmdet3d/datasets/s3dis_dataset.py +++ b/mmdet3d/datasets/s3dis_dataset.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np from os import path as osp +import numpy as np + from mmdet3d.core import show_seg_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet.datasets import DATASETS diff --git a/mmdet3d/datasets/scannet_dataset.py b/mmdet3d/datasets/scannet_dataset.py index 7d15b0bacc..a3b2c74d28 100644 --- a/mmdet3d/datasets/scannet_dataset.py +++ b/mmdet3d/datasets/scannet_dataset.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np import tempfile import warnings from os import path as osp +import numpy as np + from mmdet3d.core import show_result, show_seg_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet.datasets import DATASETS @@ -78,13 +79,13 @@ def get_data_info(self, index): index (int): Index of the sample data to get. Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - - img_prefix (str | None, optional): Prefix of image files. + - img_prefix (str, optional): Prefix of image files. - img_info (dict, optional): Image info. - ann_info (dict): Annotation info. """ @@ -129,12 +130,12 @@ def get_ann_info(self, index): Returns: dict: annotation information consists of the following keys: - - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - pts_instance_mask_path (str): Path of instance masks. - pts_semantic_mask_path (str): Path of semantic masks. - - axis_align_matrix (np.ndarray): Transformation matrix for \ + - axis_align_matrix (np.ndarray): Transformation matrix for global scene alignment. """ # Use index to get the annos, thus the evalhook could also use this api @@ -172,7 +173,7 @@ def get_ann_info(self, index): def prepare_test_data(self, index): """Prepare data for testing. - We should take axis_align_matrix from self.data_infos since we need \ + We should take axis_align_matrix from self.data_infos since we need to align point clouds. Args: @@ -272,7 +273,7 @@ class ScanNetSegDataset(Custom3DSegDataset): as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. 
- ignore_index (int, optional): The label index to be ignored, e.g. \ + ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES). Defaults to None. scene_idxs (np.ndarray | str, optional): Precomputed index to load @@ -424,7 +425,7 @@ def format_results(self, results, txtfile_prefix=None): Args: outputs (list[dict]): Testing results of the dataset. - txtfile_prefix (str | None): The prefix of saved files. It includes + txtfile_prefix (str): The prefix of saved files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. diff --git a/mmdet3d/datasets/sunrgbd_dataset.py b/mmdet3d/datasets/sunrgbd_dataset.py index 7a9a7d59c2..dbef5df62f 100644 --- a/mmdet3d/datasets/sunrgbd_dataset.py +++ b/mmdet3d/datasets/sunrgbd_dataset.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np from collections import OrderedDict from os import path as osp +import numpy as np + from mmdet3d.core import show_multi_modality_result, show_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet.core import eval_map @@ -74,13 +75,13 @@ def get_data_info(self, index): index (int): Index of the sample data to get. Returns: - dict: Data information that will be passed to the data \ + dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str, optional): Filename of point clouds. - file_name (str, optional): Filename of point clouds. - - img_prefix (str | None, optional): Prefix of image files. + - img_prefix (str, optional): Prefix of image files. - img_info (dict, optional): Image info. - calib (dict, optional): Camera calibration info. - ann_info (dict): Annotation info. @@ -125,7 +126,7 @@ def get_ann_info(self, index): Returns: dict: annotation information consists of the following keys: - - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \ + - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - pts_instance_mask_path (str): Path of instance masks. @@ -239,12 +240,15 @@ def evaluate(self, Args: results (list[dict]): List of results. - metric (str | list[str]): Metrics to be evaluated. - iou_thr (list[float]): AP IoU thresholds. - iou_thr_2d (list[float]): AP IoU thresholds for 2d evaluation. - show (bool): Whether to visualize. + metric (str | list[str], optional): Metrics to be evaluated. + Default: None. + iou_thr (list[float], optional): AP IoU thresholds for 3D + evaluation. Default: (0.25, 0.5). + iou_thr_2d (list[float], optional): AP IoU thresholds for 2D + evaluation. Default: (0.5, ). + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. 
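Reviewer note on the new monocular-3D transforms (`RandomShiftScale`, `AffineResize`) and the extra `trans_mat` / `affine_aug` meta keys added to `Collect3D`: the two transforms are designed to be chained. `RandomShiftScale` only records `center`, `size` and `affine_aug`, while `AffineResize` consumes them, warps the image to `img_scale` and stores `trans_mat` for the head. A hedged pipeline sketch follows; the numeric values and loader flags are illustrative placeholders, not values taken from this patch:

train_pipeline = [
    dict(type='LoadImageFromFileMono3D'),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
    # records 'center', 'size' and 'affine_aug'; leaves the image untouched
    dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3),
    # consumes 'center'/'size', warps the image to img_scale, stores 'trans_mat'
    dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
    dict(
        type='DefaultFormatBundle3D',
        class_names=['Pedestrian', 'Cyclist', 'Car']),
    dict(
        type='Collect3D',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
            'centers2d', 'depths'
        ]),
]

If `RandomShiftScale` is omitted, `AffineResize` falls back to the image centre and full size (the `'center' not in results` branch above), so the resize still behaves deterministically at test time.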
diff --git a/mmdet3d/datasets/utils.py b/mmdet3d/datasets/utils.py index b273fb1799..3b5c91ca7b 100644 --- a/mmdet3d/datasets/utils.py +++ b/mmdet3d/datasets/utils.py @@ -12,7 +12,7 @@ PointSegClassMapping) # yapf: enable from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import LoadImageFromFile +from mmdet.datasets.pipelines import LoadImageFromFile, MultiScaleFlipAug def is_loading_function(transform): @@ -25,7 +25,7 @@ def is_loading_function(transform): transform (dict | :obj:`Pipeline`): A transform config or a function. Returns: - bool | None: Whether it is a loading function. None means can't judge. + bool: Whether it is a loading function. None means can't judge. When transform is `MultiScaleFlipAug3D`, we return None. """ # TODO: use more elegant way to distinguish loading modules @@ -40,12 +40,12 @@ def is_loading_function(transform): return False if obj_cls in loading_functions: return True - if obj_cls in (MultiScaleFlipAug3D, ): + if obj_cls in (MultiScaleFlipAug3D, MultiScaleFlipAug): return None elif callable(transform): if isinstance(transform, loading_functions): return True - if isinstance(transform, MultiScaleFlipAug3D): + if isinstance(transform, (MultiScaleFlipAug3D, MultiScaleFlipAug)): return None return False @@ -92,7 +92,7 @@ def get_loading_pipeline(pipeline): ... dict(type='Collect3D', ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ... ] - >>> assert expected_pipelines ==\ + >>> assert expected_pipelines == \ ... get_loading_pipeline(pipelines) """ loading_pipeline = [] @@ -126,7 +126,7 @@ def extract_result_dict(results, key): key (str): Key of the desired data. Returns: - np.ndarray | torch.Tensor | None: Data term. + np.ndarray | torch.Tensor: Data term. """ if key not in results.keys(): return None diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index b42c67047f..dba2aa1bc1 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import os import tempfile +from os import path as osp + +import mmcv +import numpy as np import torch from mmcv.utils import print_log -from os import path as osp from mmdet.datasets import DATASETS from ..core.bbox import Box3DMode, points_cam2img @@ -46,8 +47,9 @@ class WaymoDataset(KittiDataset): Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. - pcd_limit_range (list): The range of point cloud used to filter - invalid predicted boxes. Default: [-85, -85, -5, 85, 85, 5]. + pcd_limit_range (list(float), optional): The range of point cloud used + to filter invalid predicted boxes. + Default: [-85, -85, -5, 85, 85, 5]. """ CLASSES = ('Car', 'Cyclist', 'Pedestrian') @@ -100,7 +102,7 @@ def get_data_info(self, index): - sample_idx (str): sample index - pts_filename (str): filename of point clouds - - img_prefix (str | None): prefix of image files + - img_prefix (str): prefix of image files - img_info (dict): image info - lidar2img (list[np.ndarray], optional): transformations from lidar to different cameras @@ -140,15 +142,15 @@ def format_results(self, Args: outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. 
- submission_prefix (str | None): The prefix of submitted files. It + submission_prefix (str): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - data_format (str | None): Output data format. Default: 'waymo'. - Another supported choice is 'kitti'. + data_format (str, optional): Output data format. + Default: 'waymo'. Another supported choice is 'kitti'. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing @@ -226,18 +228,18 @@ def evaluate(self, Args: results (list[dict]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. + metric (str | list[str], optional): Metrics to be evaluated. Default: 'waymo'. Another supported metric is 'kitti'. - logger (logging.Logger | str | None): Logger used for printing + logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. - pklfile_prefix (str | None): The prefix of pkl files. It includes + pklfile_prefix (str, optional): The prefix of pkl files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. - submission_prefix (str | None): The prefix of submission datas. + submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. - show (bool): Whether to visualize. + show (bool, optional): Whether to visualize. Default: False. - out_dir (str): Path to save the visualization results. + out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. @@ -349,8 +351,8 @@ def evaluate(self, if tmp_dir is not None: tmp_dir.cleanup() - if show: - self.show(results, out_dir, pipeline=pipeline) + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, @@ -364,8 +366,8 @@ def bbox2result_kitti(self, net_outputs (List[np.ndarray]): list of array storing the bbox and score class_nanes (List[String]): A list of class names - pklfile_prefix (str | None): The prefix of pkl file. - submission_prefix (str | None): The prefix of submission file. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. Returns: List[dict]: A list of dict have the kitti 3d format @@ -494,7 +496,6 @@ def convert_valid_bboxes(self, box_dict, info): scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] - # TODO: remove the hack of yaw box_preds.limit_yaw(offset=0.5, period=np.pi * 2) if len(box_preds) == 0: diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py index 0251a10456..9403bd72c5 100644 --- a/mmdet3d/models/backbones/__init__.py +++ b/mmdet3d/models/backbones/__init__.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt +from .dgcnn import DGCNNBackbone +from .dla import DLANet from .multi_backbone import MultiBackbone from .nostem_regnet import NoStemRegNet from .pointnet2_sa_msg import PointNet2SAMSG @@ -8,5 +10,6 @@ __all__ = [ 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', - 'SECOND', 'PointNet2SASSG', 'PointNet2SAMSG', 'MultiBackbone' + 'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG', + 'MultiBackbone', 'DLANet' ] diff --git a/mmdet3d/models/backbones/base_pointnet.py b/mmdet3d/models/backbones/base_pointnet.py index fb7c05448b..31439e6a64 100644 --- a/mmdet3d/models/backbones/base_pointnet.py +++ b/mmdet3d/models/backbones/base_pointnet.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings from abc import ABCMeta + from mmcv.runner import BaseModule diff --git a/mmdet3d/models/backbones/dgcnn.py b/mmdet3d/models/backbones/dgcnn.py new file mode 100644 index 0000000000..fe369890e0 --- /dev/null +++ b/mmdet3d/models/backbones/dgcnn.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import BaseModule, auto_fp16 +from torch import nn as nn + +from mmdet3d.ops import DGCNNFAModule, DGCNNGFModule +from mmdet.models import BACKBONES + + +@BACKBONES.register_module() +class DGCNNBackbone(BaseModule): + """Backbone network for DGCNN. + + Args: + in_channels (int): Input channels of point cloud. + num_samples (tuple[int], optional): The number of samples for knn or + ball query in each graph feature (GF) module. + Defaults to (20, 20, 20). + knn_modes (tuple[str], optional): Mode of KNN of each knn module. + Defaults to ('D-KNN', 'F-KNN', 'F-KNN'). + radius (tuple[float], optional): Sampling radii of each GF module. + Defaults to (None, None, None). + gf_channels (tuple[tuple[int]], optional): Out channels of each mlp in + GF module. Defaults to ((64, 64), (64, 64), (64, )). + fa_channels (tuple[int], optional): Out channels of each mlp in FA + module. Defaults to (1024, ). + act_cfg (dict, optional): Config of activation layer. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. + Defaults to None. + """ + + def __init__(self, + in_channels, + num_samples=(20, 20, 20), + knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), + radius=(None, None, None), + gf_channels=((64, 64), (64, 64), (64, )), + fa_channels=(1024, ), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_gf = len(gf_channels) + + assert len(num_samples) == len(knn_modes) == len(radius) == len( + gf_channels), 'Num_samples, knn_modes, radius and gf_channels \ + should have the same length.' 
+ + self.GF_modules = nn.ModuleList() + gf_in_channel = in_channels * 2 + skip_channel_list = [gf_in_channel] # input channel list + + for gf_index in range(self.num_gf): + cur_gf_mlps = list(gf_channels[gf_index]) + cur_gf_mlps = [gf_in_channel] + cur_gf_mlps + gf_out_channel = cur_gf_mlps[-1] + + self.GF_modules.append( + DGCNNGFModule( + mlp_channels=cur_gf_mlps, + num_sample=num_samples[gf_index], + knn_mode=knn_modes[gf_index], + radius=radius[gf_index], + act_cfg=act_cfg)) + skip_channel_list.append(gf_out_channel) + gf_in_channel = gf_out_channel * 2 + + fa_in_channel = sum(skip_channel_list[1:]) + cur_fa_mlps = list(fa_channels) + cur_fa_mlps = [fa_in_channel] + cur_fa_mlps + + self.FA_module = DGCNNFAModule( + mlp_channels=cur_fa_mlps, act_cfg=act_cfg) + + @auto_fp16(apply_to=('points', )) + def forward(self, points): + """Forward pass. + + Args: + points (torch.Tensor): point coordinates with features, + with shape (B, N, in_channels). + + Returns: + dict[str, list[torch.Tensor]]: Outputs after graph feature (GF) and + feature aggregation (FA) modules. + + - gf_points (list[torch.Tensor]): Outputs after each GF module. + - fa_points (torch.Tensor): Outputs after FA module. + """ + gf_points = [points] + + for i in range(self.num_gf): + cur_points = self.GF_modules[i](gf_points[i]) + gf_points.append(cur_points) + + fa_points = self.FA_module(gf_points) + + out = dict(gf_points=gf_points, fa_points=fa_points) + return out diff --git a/mmdet3d/models/backbones/dla.py b/mmdet3d/models/backbones/dla.py new file mode 100644 index 0000000000..85e388f4dd --- /dev/null +++ b/mmdet3d/models/backbones/dla.py @@ -0,0 +1,446 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule +from torch import nn + +from mmdet.models.builder import BACKBONES + + +def dla_build_norm_layer(cfg, num_features): + """Build normalization layer specially designed for DLANet. + + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + + + Returns: + Function: Build normalization layer in mmcv. + """ + cfg_ = cfg.copy() + if cfg_['type'] == 'GN': + if num_features % 32 == 0: + return build_norm_layer(cfg_, num_features) + else: + assert 'num_groups' in cfg_ + cfg_['num_groups'] = cfg_['num_groups'] // 2 + return build_norm_layer(cfg_, num_features) + else: + return build_norm_layer(cfg_, num_features) + + +class BasicBlock(BaseModule): + """BasicBlock in DLANet. + + Args: + in_channels (int): Input feature channel. + out_channels (int): Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Conv stride. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. + init_cfg (dict, optional): Initialization config. + Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride=1, + dilation=1, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.norm1 = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.relu = nn.ReLU(inplace=True) + self.conv2 = build_conv_layer( + conv_cfg, + out_channels, + out_channels, + 3, + stride=1, + padding=dilation, + dilation=dilation, + bias=False) + self.norm2 = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.stride = stride + + def forward(self, x, identity=None): + """Forward function.""" + + if identity is None: + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.norm2(out) + out += identity + out = self.relu(out) + + return out + + +class Root(BaseModule): + """Root in DLANet. + + Args: + in_channels (int): Input feature channel. + out_channels (int): Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + kernel_size (int): Size of convolution kernel. + add_identity (bool): Whether to add identity in root. + init_cfg (dict, optional): Initialization config. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + kernel_size, + add_identity, + init_cfg=None): + super(Root, self).__init__(init_cfg) + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 1, + stride=1, + padding=(kernel_size - 1) // 2, + bias=False) + self.norm = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.relu = nn.ReLU(inplace=True) + self.add_identity = add_identity + + def forward(self, feat_list): + """Forward function. + + Args: + feat_list (list[torch.Tensor]): Output features from + multiple layers. + """ + children = feat_list + x = self.conv(torch.cat(feat_list, 1)) + x = self.norm(x) + if self.add_identity: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(BaseModule): + """Tree in DLANet. + + Args: + levels (int): The level of the tree. + block (nn.Module): The block module in tree. + in_channels: Input feature channel. + out_channels: Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Convolution stride. + Default: 1. + level_root (bool, optional): whether belongs to the + root layer. + root_dim (int, optional): Root input feature channel. + root_kernel_size (int, optional): Size of root + convolution kernel. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. + add_identity (bool, optional): Whether to add + identity in root. Default: False. + init_cfg (dict, optional): Initialization config. + Default: None. 
+ """ + + def __init__(self, + levels, + block, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride=1, + level_root=False, + root_dim=None, + root_kernel_size=1, + dilation=1, + add_identity=False, + init_cfg=None): + super(Tree, self).__init__(init_cfg) + if root_dim is None: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.root = Root(root_dim, out_channels, norm_cfg, conv_cfg, + root_kernel_size, add_identity) + self.tree1 = block( + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride, + dilation=dilation) + self.tree2 = block( + out_channels, + out_channels, + norm_cfg, + conv_cfg, + 1, + dilation=dilation) + else: + self.tree1 = Tree( + levels - 1, + block, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride, + root_dim=None, + root_kernel_size=root_kernel_size, + dilation=dilation, + add_identity=add_identity) + self.tree2 = Tree( + levels - 1, + block, + out_channels, + out_channels, + norm_cfg, + conv_cfg, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, + add_identity=add_identity) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 1, + stride=1, + bias=False), + dla_build_norm_layer(norm_cfg, out_channels)[1]) + + def forward(self, x, identity=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + identity = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, identity) + if self.levels == 1: + x2 = self.tree2(x1) + feat_list = [x2, x1] + children + x = self.root(feat_list) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +@BACKBONES.register_module() +class DLANet(BaseModule): + r"""`DLA backbone `_. + + Args: + depth (int): Depth of DLA. Default: 34. + in_channels (int, optional): Number of input image channels. + Default: 3. + norm_cfg (dict, optional): Dictionary to construct and config + norm layer. Default: None. + conv_cfg (dict, optional): Dictionary to construct and config + conv layer. Default: None. + layer_with_level_root (list[bool], optional): Whether to apply + level_root in each DLA layer, this is only used for + tree levels. Default: (False, True, True, True). + with_identity_root (bool, optional): Whether to add identity + in root layer. Default: False. + pretrained (str, optional): model pretrained path. + Default: None. + init_cfg (dict or list[dict], optional): Initialization + config dict. 
Default: None + """ + arch_settings = { + 34: (BasicBlock, (1, 1, 1, 2, 2, 1), (16, 32, 64, 128, 256, 512)), + } + + def __init__(self, + depth, + in_channels=3, + out_indices=(0, 1, 2, 3, 4, 5), + frozen_stages=-1, + norm_cfg=None, + conv_cfg=None, + layer_with_level_root=(False, True, True, True), + with_identity_root=False, + pretrained=None, + init_cfg=None): + super(DLANet, self).__init__(init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalida depth {depth} for DLA') + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + + block, levels, channels = self.arch_settings[depth] + self.channels = channels + self.num_levels = len(levels) + self.frozen_stages = frozen_stages + self.out_indices = out_indices + assert max(out_indices) < self.num_levels + self.base_layer = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + channels[0], + 7, + stride=1, + padding=3, + bias=False), + dla_build_norm_layer(norm_cfg, channels[0])[1], + nn.ReLU(inplace=True)) + + # DLANet first uses two conv layers then uses several + # Tree layers + for i in range(2): + level_layer = self._make_conv_level( + channels[0], + channels[i], + levels[i], + norm_cfg, + conv_cfg, + stride=i + 1) + layer_name = f'level{i}' + self.add_module(layer_name, level_layer) + + for i in range(2, self.num_levels): + dla_layer = Tree( + levels[i], + block, + channels[i - 1], + channels[i], + norm_cfg, + conv_cfg, + 2, + level_root=layer_with_level_root[i - 2], + add_identity=with_identity_root) + layer_name = f'level{i}' + self.add_module(layer_name, dla_layer) + + self._freeze_stages() + + def _make_conv_level(self, + in_channels, + out_channels, + num_convs, + norm_cfg, + conv_cfg, + stride=1, + dilation=1): + """Conv modules. + + Args: + in_channels (int): Input feature channel. + out_channels (int): Output feature channel. + num_convs (int): Number of Conv module. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Conv stride. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. 
+ """ + modules = [] + for i in range(num_convs): + modules.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 3, + stride=stride if i == 0 else 1, + padding=dilation, + bias=False, + dilation=dilation), + dla_build_norm_layer(norm_cfg, out_channels)[1], + nn.ReLU(inplace=True) + ]) + in_channels = out_channels + return nn.Sequential(*modules) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.base_layer.eval() + for param in self.base_layer.parameters(): + param.requires_grad = False + + for i in range(2): + m = getattr(self, f'level{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'level{i+1}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + outs = [] + x = self.base_layer(x) + for i in range(self.num_levels): + x = getattr(self, 'level{}'.format(i))(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/backbones/multi_backbone.py b/mmdet3d/models/backbones/multi_backbone.py index b1e1da5b18..1e9d0703f3 100644 --- a/mmdet3d/models/backbones/multi_backbone.py +++ b/mmdet3d/models/backbones/multi_backbone.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -import torch import warnings + +import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn diff --git a/mmdet3d/models/backbones/nostem_regnet.py b/mmdet3d/models/backbones/nostem_regnet.py index e4c34e2451..3090508330 100644 --- a/mmdet3d/models/backbones/nostem_regnet.py +++ b/mmdet3d/models/backbones/nostem_regnet.py @@ -16,7 +16,7 @@ class NoStemRegNet(RegNet): - wm (float): Quantization parameter to quantize the width. - depth (int): Depth of the backbone. - group_w (int): Width of group. - - bot_mul (float): Bottleneck ratio, i.e. expansion of bottlneck. + - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck. strides (Sequence[int]): Strides of the first block of each stage. base_channels (int): Base channels after stem layer. in_channels (int): Number of input image channels. Normally 3. diff --git a/mmdet3d/models/backbones/pointnet2_sa_msg.py b/mmdet3d/models/backbones/pointnet2_sa_msg.py index dcf3931aa9..7770f53141 100644 --- a/mmdet3d/models/backbones/pointnet2_sa_msg.py +++ b/mmdet3d/models/backbones/pointnet2_sa_msg.py @@ -64,7 +64,11 @@ def __init__(self, self.out_indices = out_indices assert max(out_indices) < self.num_sa assert len(num_points) == len(radii) == len(num_samples) == len( - sa_channels) == len(aggregation_channels) + sa_channels) + if aggregation_channels is not None: + assert len(sa_channels) == len(aggregation_channels) + else: + aggregation_channels = [None] * len(sa_channels) self.SA_modules = nn.ModuleList() self.aggregation_mlps = nn.ModuleList() @@ -134,7 +138,7 @@ def forward(self, points): - sa_xyz (torch.Tensor): The coordinates of sa features. - sa_features (torch.Tensor): The features from the last Set Aggregation Layers. - - sa_indices (torch.Tensor): Indices of the \ + - sa_indices (torch.Tensor): Indices of the input points. 
""" xyz, features = self._split_point_feats(points) diff --git a/mmdet3d/models/backbones/pointnet2_sa_ssg.py b/mmdet3d/models/backbones/pointnet2_sa_ssg.py index 2f3a0cc68a..eb5f4d6cad 100644 --- a/mmdet3d/models/backbones/pointnet2_sa_ssg.py +++ b/mmdet3d/models/backbones/pointnet2_sa_ssg.py @@ -97,11 +97,11 @@ def forward(self, points): Returns: dict[str, list[torch.Tensor]]: Outputs after SA and FP modules. - - fp_xyz (list[torch.Tensor]): The coordinates of \ + - fp_xyz (list[torch.Tensor]): The coordinates of each fp features. - - fp_features (list[torch.Tensor]): The features \ + - fp_features (list[torch.Tensor]): The features from each Feature Propagate Layers. - - fp_indices (list[torch.Tensor]): Indices of the \ + - fp_indices (list[torch.Tensor]): Indices of the input points. """ xyz, features = self._split_point_feats(points) diff --git a/mmdet3d/models/backbones/second.py b/mmdet3d/models/backbones/second.py index 680398c5d9..a09c5bb7f4 100644 --- a/mmdet3d/models/backbones/second.py +++ b/mmdet3d/models/backbones/second.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings + from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import BaseModule from torch import nn as nn diff --git a/mmdet3d/models/builder.py b/mmdet3d/models/builder.py index 09d062420b..597e0a3398 100644 --- a/mmdet3d/models/builder.py +++ b/mmdet3d/models/builder.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings + from mmcv.cnn import MODELS as MMCV_MODELS from mmcv.utils import Registry diff --git a/mmdet3d/models/decode_heads/__init__.py b/mmdet3d/models/decode_heads/__init__.py index e17d91da0c..2e86c7c8a9 100644 --- a/mmdet3d/models/decode_heads/__init__.py +++ b/mmdet3d/models/decode_heads/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .dgcnn_head import DGCNNHead from .paconv_head import PAConvHead from .pointnet2_head import PointNet2Head -__all__ = ['PointNet2Head', 'PAConvHead'] +__all__ = ['PointNet2Head', 'DGCNNHead', 'PAConvHead'] diff --git a/mmdet3d/models/decode_heads/decode_head.py b/mmdet3d/models/decode_heads/decode_head.py index 672340b668..6ccbfe0ecd 100644 --- a/mmdet3d/models/decode_heads/decode_head.py +++ b/mmdet3d/models/decode_heads/decode_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod + from mmcv.cnn import normal_init from mmcv.runner import BaseModule, auto_fp16, force_fp32 from torch import nn as nn @@ -13,17 +14,18 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta): Args: channels (int): Channels after modules, before conv_seg. num_classes (int): Number of classes. - dropout_ratio (float): Ratio of dropout layer. Default: 0.5. - conv_cfg (dict|None): Config of conv layers. + dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5. + conv_cfg (dict, optional): Config of conv layers. Default: dict(type='Conv1d'). - norm_cfg (dict|None): Config of norm layers. + norm_cfg (dict, optional): Config of norm layers. Default: dict(type='BN1d'). - act_cfg (dict): Config of activation layers. + act_cfg (dict, optional): Config of activation layers. Default: dict(type='ReLU'). - loss_decode (dict): Config of decode loss. + loss_decode (dict, optional): Config of decode loss. Default: dict(type='CrossEntropyLoss'). - ignore_index (int | None): The label index to be ignored. When using - masked BCE loss, ignore_index should be set to None. Default: 255. 
+ ignore_index (int, optional): The label index to be ignored. + When using masked BCE loss, ignore_index should be set to None. + Default: 255. """ def __init__(self, @@ -110,9 +112,9 @@ def losses(self, seg_logit, seg_label): """Compute semantic segmentation loss. Args: - seg_logit (torch.Tensor): Predicted per-point segmentation logits \ + seg_logit (torch.Tensor): Predicted per-point segmentation logits of shape [B, num_classes, N]. - seg_label (torch.Tensor): Ground-truth segmentation label of \ + seg_label (torch.Tensor): Ground-truth segmentation label of shape [B, N]. """ loss = dict() diff --git a/mmdet3d/models/decode_heads/dgcnn_head.py b/mmdet3d/models/decode_heads/dgcnn_head.py new file mode 100644 index 0000000000..4d4e1887bc --- /dev/null +++ b/mmdet3d/models/decode_heads/dgcnn_head.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn.bricks import ConvModule + +from mmdet3d.ops import DGCNNFPModule +from mmdet.models import HEADS +from .decode_head import Base3DDecodeHead + + +@HEADS.register_module() +class DGCNNHead(Base3DDecodeHead): + r"""DGCNN decoder head. + + Decoder head used in `DGCNN `_. + Refer to the + `reimplementation code `_. + + Args: + fp_channels (tuple[int], optional): Tuple of mlp channels in feature + propagation (FP) modules. Defaults to (1216, 512). + """ + + def __init__(self, fp_channels=(1216, 512), **kwargs): + super(DGCNNHead, self).__init__(**kwargs) + + self.FP_module = DGCNNFPModule( + mlp_channels=fp_channels, act_cfg=self.act_cfg) + + # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 + self.pre_seg_conv = ConvModule( + fp_channels[-1], + self.channels, + kernel_size=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: points for decoder. + """ + fa_points = feat_dict['fa_points'] + + return fa_points + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Segmentation map of shape [B, num_classes, N]. + """ + fa_points = self._extract_input(feat_dict) + + fp_points = self.FP_module(fa_points) + fp_points = fp_points.transpose(1, 2).contiguous() + output = self.pre_seg_conv(fp_points) + output = self.cls_seg(output) + + return output diff --git a/mmdet3d/models/decode_heads/paconv_head.py b/mmdet3d/models/decode_heads/paconv_head.py index 6ace064264..e662c976c2 100644 --- a/mmdet3d/models/decode_heads/paconv_head.py +++ b/mmdet3d/models/decode_heads/paconv_head.py @@ -14,7 +14,7 @@ class PAConvHead(PointNet2Head): Args: fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. - fp_norm_cfg (dict|None): Config of norm layers used in FP modules. + fp_norm_cfg (dict): Config of norm layers used in FP modules. Default: dict(type='BN2d'). """ diff --git a/mmdet3d/models/decode_heads/pointnet2_head.py b/mmdet3d/models/decode_heads/pointnet2_head.py index c7fe0d553d..0585df6ab4 100644 --- a/mmdet3d/models/decode_heads/pointnet2_head.py +++ b/mmdet3d/models/decode_heads/pointnet2_head.py @@ -16,7 +16,7 @@ class PointNet2Head(Base3DDecodeHead): Args: fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. - fp_norm_cfg (dict|None): Config of norm layers used in FP modules. + fp_norm_cfg (dict): Config of norm layers used in FP modules. Default: dict(type='BN2d'). 
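The new ``DGCNNHead`` above is deliberately thin; here is a plain-PyTorch sketch of its data flow, where the ``Linear`` stands in for ``DGCNNFPModule`` and every shape (batch, points, 13 classes) is an assumption for illustration:

.. code-block:: python

    import torch
    from torch import nn

    B, N, num_classes = 2, 4096, 13
    fa_points = torch.rand(B, N, 1216)      # stand-in for feat_dict['fa_points']

    fp_module = nn.Sequential(nn.Linear(1216, 512), nn.ReLU())  # ~ DGCNNFPModule
    pre_seg_conv = nn.Conv1d(512, 256, kernel_size=1, bias=False)
    cls_seg = nn.Conv1d(256, num_classes, kernel_size=1)

    fp_points = fp_module(fa_points)                               # [B, N, 512]
    feats = pre_seg_conv(fp_points.transpose(1, 2).contiguous())   # [B, 256, N]
    seg_logits = cls_seg(feats)                                    # [B, num_classes, N]
    print(seg_logits.shape)

In the real head, ``cls_seg`` comes from ``Base3DDecodeHead`` and also applies the configured dropout before the final 1x1 conv.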
""" diff --git a/mmdet3d/models/dense_heads/__init__.py b/mmdet3d/models/dense_heads/__init__.py index d2283786db..25008c95bc 100644 --- a/mmdet3d/models/dense_heads/__init__.py +++ b/mmdet3d/models/dense_heads/__init__.py @@ -7,8 +7,12 @@ from .fcos_mono3d_head import FCOSMono3DHead from .free_anchor3d_head import FreeAnchor3DHead from .groupfree3d_head import GroupFree3DHead +from .monoflex_head import MonoFlexHead from .parta2_rpn_head import PartA2RPNHead +from .pgd_head import PGDHead +from .point_rpn_head import PointRPNHead from .shape_aware_head import ShapeAwareHead +from .smoke_mono3d_head import SMOKEMono3DHead from .ssd_3d_head import SSD3DHead from .vote_head import VoteHead @@ -16,5 +20,6 @@ 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead', - 'GroupFree3DHead' + 'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead', + 'MonoFlexHead' ] diff --git a/mmdet3d/models/dense_heads/anchor3d_head.py b/mmdet3d/models/dense_heads/anchor3d_head.py index 24eb7f9791..792000e1da 100644 --- a/mmdet3d/models/dense_heads/anchor3d_head.py +++ b/mmdet3d/models/dense_heads/anchor3d_head.py @@ -51,15 +51,15 @@ def __init__(self, type='Anchor3DRangeGenerator', range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], strides=[2], - sizes=[[1.6, 3.9, 1.56]], + sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], custom_values=[], reshape_out=False), assigner_per_size=False, assign_per_class=False, diff_rad_by_sin=True, - dir_offset=0, - dir_limit_offset=1, + dir_offset=-np.pi / 2, + dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='CrossEntropyLoss', @@ -81,6 +81,10 @@ def __init__(self, self.assign_per_class = assign_per_class self.dir_offset = dir_offset self.dir_limit_offset = dir_limit_offset + import warnings + warnings.warn( + 'dir_offset and dir_limit_offset will be depressed and be ' + 'incorporated into box coder in the future') self.fp16_enabled = False # build anchor generator @@ -145,7 +149,7 @@ def forward_single(self, x): x (torch.Tensor): Input features. Returns: - tuple[torch.Tensor]: Contain score of each class, bbox \ + tuple[torch.Tensor]: Contain score of each class, bbox regression and direction classification predictions. """ cls_score = self.conv_cls(x) @@ -163,7 +167,7 @@ def forward(self, feats): features produced by FPN. Returns: - tuple[list[torch.Tensor]]: Multi-level class score, bbox \ + tuple[list[torch.Tensor]]: Multi-level class score, bbox and direction predictions. """ return multi_apply(self.forward_single, feats) @@ -177,7 +181,7 @@ def get_anchors(self, featmap_sizes, input_metas, device='cuda'): device (str): device of current module. Returns: - list[list[torch.Tensor]]: Anchors of each image, valid flags \ + list[list[torch.Tensor]]: Anchors of each image, valid flags of each image. """ num_imgs = len(input_metas) @@ -207,7 +211,7 @@ def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, num_total_samples (int): The number of valid samples. Returns: - tuple[torch.Tensor]: Losses of class, bbox \ + tuple[torch.Tensor]: Losses of class, bbox and direction, respectively. """ # classification loss @@ -285,7 +289,7 @@ def add_sin_difference(boxes1, boxes2): the 7th dimension is rotation dimension. Returns: - tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \ + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th dimensions are changed. 
""" rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( @@ -318,16 +322,16 @@ class predictions. of each sample. gt_labels (list[torch.Tensor]): Gt labels of each sample. input_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify - which bounding. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding boxes to ignore. Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and \ + dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_cls (list[torch.Tensor]): Classification losses. - loss_bbox (list[torch.Tensor]): Box regression losses. - - loss_dir (list[torch.Tensor]): Direction classification \ + - loss_dir (list[torch.Tensor]): Direction classification losses. """ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] @@ -385,7 +389,7 @@ def get_bboxes(self, dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. input_metas (list[dict]): Contain pcd and img's meta info. - cfg (None | :obj:`ConfigDict`): Training or testing config. + cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): Whether th rescale bbox. Returns: @@ -439,7 +443,7 @@ def get_bboxes_single(self, mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. - cfg (None | :obj:`ConfigDict`): Training or testing config. + cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): whether th rescale bbox. Returns: diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py index dc9aa2cd80..7953a027c0 100644 --- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py +++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch from abc import abstractmethod + +import torch from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init from mmcv.runner import force_fp32 from torch import nn as nn @@ -18,35 +19,45 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. - feat_channels (int): Number of hidden channels. Used in child classes. - stacked_convs (int): Number of stacking convs of the head. - strides (tuple): Downsample factor of each feature map. - dcn_on_last_conv (bool): If true, use dcn in the last layer of - towers. Default: False. - conv_bias (bool | str): If specified as `auto`, it will be decided by - the norm_cfg. Bias of conv will be set as True if `norm_cfg` is - None, otherwise False. Default: "auto". - background_label (int | None): Label ID of background, set as 0 for - RPN and num_classes for other heads. It will automatically set as - num_classes if None is given. - use_direction_classifier (bool): Whether to add a direction classifier. - diff_rad_by_sin (bool): Whether to change the difference into sin - difference for box regression loss. - loss_cls (dict): Config of classification loss. - loss_bbox (dict): Config of localization loss. - loss_dir (dict): Config of direction classifier loss. - loss_attr (dict): Config of attribute classifier loss, which is only - active when pred_attrs=True. - bbox_code_size (int): Dimensions of predicted bounding boxes. - pred_attrs (bool): Whether to predict attributes. Default to False. 
- num_attrs (int): The number of attributes to be predicted. Default: 9. - pred_velo (bool): Whether to predict velocity. Default to False. - pred_bbox2d (bool): Whether to predict 2D boxes. Default to False. - group_reg_dims (tuple[int]): The dimension of each regression target - group. Default: (2, 1, 3, 1, 2). - cls_branch (tuple[int]): Channels for classification branch. + feat_channels (int, optional): Number of hidden channels. + Used in child classes. Defaults to 256. + stacked_convs (int, optional): Number of stacking convs of the head. + strides (tuple, optional): Downsample factor of each feature map. + dcn_on_last_conv (bool, optional): If true, use dcn in the last + layer of towers. Default: False. + conv_bias (bool | str, optional): If specified as `auto`, it will be + decided by the norm_cfg. Bias of conv will be set as True + if `norm_cfg` is None, otherwise False. Default: 'auto'. + background_label (int, optional): Label ID of background, + set as 0 for RPN and num_classes for other heads. + It will automatically set as `num_classes` if None is given. + use_direction_classifier (bool, optional): + Whether to add a direction classifier. + diff_rad_by_sin (bool, optional): Whether to change the difference + into sin difference for box regression loss. Defaults to True. + dir_offset (float, optional): Parameter used in direction + classification. Defaults to 0. + dir_limit_offset (float, optional): Parameter used in direction + classification. Defaults to 0. + loss_cls (dict, optional): Config of classification loss. + loss_bbox (dict, optional): Config of localization loss. + loss_dir (dict, optional): Config of direction classifier loss. + loss_attr (dict, optional): Config of attribute classifier loss, + which is only active when `pred_attrs=True`. + bbox_code_size (int, optional): Dimensions of predicted bounding boxes. + pred_attrs (bool, optional): Whether to predict attributes. + Defaults to False. + num_attrs (int, optional): The number of attributes to be predicted. + Default: 9. + pred_velo (bool, optional): Whether to predict velocity. + Defaults to False. + pred_bbox2d (bool, optional): Whether to predict 2D boxes. + Defaults to False. + group_reg_dims (tuple[int], optional): The dimension of each regression + target group. Default: (2, 1, 3, 1, 2). + cls_branch (tuple[int], optional): Channels for classification branch. Default: (128, 64). - reg_branch (tuple[tuple]): Channels for regression branch. + reg_branch (tuple[tuple], optional): Channels for regression branch. Default: ( (128, 64), # offset (128, 64), # depth @@ -54,14 +65,16 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): (64, ), # rot () # velo ), - dir_branch (tuple[int]): Channels for direction classification branch. - Default: (64, ). - attr_branch (tuple[int]): Channels for classification branch. + dir_branch (tuple[int], optional): Channels for direction + classification branch. Default: (64, ). + attr_branch (tuple[int], optional): Channels for classification branch. Default: (64, ). - conv_cfg (dict): Config dict for convolution layer. Default: None. - norm_cfg (dict): Config dict for normalization layer. Default: None. - train_cfg (dict): Training config of anchor head. - test_cfg (dict): Testing config of anchor head. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + train_cfg (dict, optional): Training config of anchor head. 
+ test_cfg (dict, optional): Testing config of anchor head. """ # noqa: W605 _version = 1 @@ -79,6 +92,7 @@ def __init__( use_direction_classifier=True, diff_rad_by_sin=True, dir_offset=0, + dir_limit_offset=0, loss_cls=dict( type='FocalLoss', use_sigmoid=True, @@ -125,6 +139,7 @@ def __init__( self.use_direction_classifier = use_direction_classifier self.diff_rad_by_sin = diff_rad_by_sin self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_dir = build_loss(loss_dir) @@ -162,13 +177,6 @@ def __init__( self.attr_branch = attr_branch self._init_layers() - if init_cfg is None: - self.init_cfg = dict( - type='Normal', - layer='Conv2d', - std=0.01, - override=dict( - type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) def _init_layers(self): """Initialize layers of the head.""" @@ -274,8 +282,34 @@ def _init_predictor(self): self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1) def init_weights(self): - super().init_weights() + """Initialize weights of the head. + + We currently still use the customized defined init_weights because the + default init of DCN triggered by the init_cfg will init + conv_offset.weight, which mistakenly affects the training stability. + """ + for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]: + for m in modules: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + for conv_reg_prev in self.conv_reg_prevs: + if conv_reg_prev is None: + continue + for m in conv_reg_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + if self.use_direction_classifier: + for m in self.conv_dir_cls_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + if self.pred_attrs: + for m in self.conv_attr_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + for conv_reg in self.conv_regs: + normal_init(conv_reg, std=0.01) if self.use_direction_classifier: normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls) if self.pred_attrs: @@ -289,7 +323,7 @@ def forward(self, feats): a 4D-tensor. Returns: - tuple: Usually contain classification scores, bbox predictions, \ + tuple: Usually contain classification scores, bbox predictions, and direction class predictions. cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is @@ -307,7 +341,7 @@ def forward(self, feats): return multi_apply(self.forward_single, feats)[:5] def forward_single(self, x): - """Forward features of a single scale levle. + """Forward features of a single scale level. Args: x (Tensor): FPN feature maps of the specified stride. @@ -401,7 +435,7 @@ def loss(self, corresponding to each box img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. - gt_bboxes_ignore (None | list[Tensor]): specify which bounding + gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. """ diff --git a/mmdet3d/models/dense_heads/base_mono3d_dense_head.py b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py index 322c7cf92d..2444473058 100644 --- a/mmdet3d/models/dense_heads/base_mono3d_dense_head.py +++ b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
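The customized ``init_weights`` above pairs ``normal_init`` with ``bias_init_with_prob(0.01)`` for the classification conv. The bias formula simply inverts the sigmoid so that the head starts out predicting the desired foreground prior; a self-contained sketch (the formula matches my understanding of ``mmcv.cnn.bias_init_with_prob``):

.. code-block:: python

    import numpy as np
    import torch
    from torch import nn

    def bias_init_with_prob(prior_prob):
        # choose b such that sigmoid(b) == prior_prob
        return float(-np.log((1 - prior_prob) / prior_prob))

    conv_cls = nn.Conv2d(64, 10, 1)
    nn.init.normal_(conv_cls.weight, std=0.01)
    nn.init.constant_(conv_cls.bias, bias_init_with_prob(0.01))
    print(torch.sigmoid(conv_cls.bias[:1]))  # ~0.01 at the start of training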
from abc import ABCMeta, abstractmethod + from mmcv.runner import BaseModule diff --git a/mmdet3d/models/dense_heads/centerpoint_head.py b/mmdet3d/models/dense_heads/centerpoint_head.py index aa6ff7cdda..0529968f77 100644 --- a/mmdet3d/models/dense_heads/centerpoint_head.py +++ b/mmdet3d/models/dense_heads/centerpoint_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy + import torch from mmcv.cnn import ConvModule, build_conv_layer from mmcv.runner import BaseModule, force_fp32 @@ -21,16 +22,16 @@ class SeparateHead(BaseModule): Args: in_channels (int): Input channels for conv_layer. heads (dict): Conv information. - head_conv (int): Output channels. + head_conv (int, optional): Output channels. Default: 64. - final_kernal (int): Kernal size for the last conv layer. - Deafult: 1. - init_bias (float): Initial bias. Default: -2.19. - conv_cfg (dict): Config of conv layer. + final_kernel (int, optional): Kernel size for the last conv layer. + Default: 1. + init_bias (float, optional): Initial bias. Default: -2.19. + conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') - norm_cfg (dict): Config of norm layer. + norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). - bias (str): Type of bias. Default: 'auto'. + bias (str, optional): Type of bias. Default: 'auto'. """ def __init__(self, @@ -100,17 +101,17 @@ def forward(self, x): Returns: dict[str: torch.Tensor]: contains the following keys: - -reg (torch.Tensor): 2D regression value with the \ + -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. - -height (torch.Tensor): Height value with the \ + -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. - -dim (torch.Tensor): Size value with the shape \ + -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. - -rot (torch.Tensor): Rotation value with the \ + -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. - -vel (torch.Tensor): Velocity value with the \ + -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. - -heatmap (torch.Tensor): Heatmap with the shape of \ + -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ ret_dict = dict() @@ -131,18 +132,19 @@ class DCNSeparateHead(BaseModule): Args: in_channels (int): Input channels for conv_layer. + num_cls (int): Number of classes. heads (dict): Conv information. dcn_config (dict): Config of dcn layer. - num_cls (int): Output channels. + head_conv (int, optional): Output channels. Default: 64. - final_kernal (int): Kernal size for the last conv layer. - Deafult: 1. - init_bias (float): Initial bias. Default: -2.19. - conv_cfg (dict): Config of conv layer. + final_kernel (int, optional): Kernel size for the last conv + layer. Default: 1. + init_bias (float, optional): Initial bias. Default: -2.19. + conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') - norm_cfg (dict): Config of norm layer. + norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). - bias (str): Type of bias. Default: 'auto'. + bias (str, optional): Type of bias. Default: 'auto'. """ # noqa: W605 def __init__(self, @@ -215,17 +217,17 @@ def forward(self, x): Returns: dict[str: torch.Tensor]: contains the following keys: - -reg (torch.Tensor): 2D regression value with the \ + -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. - -height (torch.Tensor): Height value with the \ + -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. 
- -dim (torch.Tensor): Size value with the shape \ + -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. - -rot (torch.Tensor): Rotation value with the \ + -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. - -vel (torch.Tensor): Velocity value with the \ + -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. - -heatmap (torch.Tensor): Heatmap with the shape of \ + -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ center_feat = self.feature_adapt_cls(x) @@ -243,31 +245,30 @@ class CenterHead(BaseModule): """CenterHead for CenterPoint. Args: - mode (str): Mode of the head. Default: '3d'. - in_channels (list[int] | int): Channels of the input feature map. - Default: [128]. - tasks (list[dict]): Task information including class number + in_channels (list[int] | int, optional): Channels of the input + feature map. Default: [128]. + tasks (list[dict], optional): Task information including class number and class names. Default: None. - dataset (str): Name of the dataset. Default: 'nuscenes'. - weight (float): Weight for location loss. Default: 0.25. - code_weights (list[int]): Code weights for location loss. Default: []. - common_heads (dict): Conv information for common heads. + train_cfg (dict, optional): Train-time configs. Default: None. + test_cfg (dict, optional): Test-time configs. Default: None. + bbox_coder (dict, optional): Bbox coder configs. Default: None. + common_heads (dict, optional): Conv information for common heads. Default: dict(). - loss_cls (dict): Config of classification loss function. + loss_cls (dict, optional): Config of classification loss function. Default: dict(type='GaussianFocalLoss', reduction='mean'). - loss_bbox (dict): Config of regression loss function. + loss_bbox (dict, optional): Config of regression loss function. Default: dict(type='L1Loss', reduction='none'). - separate_head (dict): Config of separate head. Default: dict( + separate_head (dict, optional): Config of separate head. Default: dict( type='SeparateHead', init_bias=-2.19, final_kernel=3) - share_conv_channel (int): Output channels for share_conv_layer. - Default: 64. - num_heatmap_convs (int): Number of conv layers for heatmap conv layer. - Default: 2. - conv_cfg (dict): Config of conv layer. + share_conv_channel (int, optional): Output channels for share_conv + layer. Default: 64. + num_heatmap_convs (int, optional): Number of conv layers for heatmap + conv layer. Default: 2. + conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') - norm_cfg (dict): Config of norm layer. + norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). - bias (str): Type of bias. Default: 'auto'. + bias (str, optional): Type of bias. Default: 'auto'. """ def __init__(self, @@ -366,8 +367,8 @@ def _gather_feat(self, feat, ind, mask=None): feat (torch.tensor): Feature map with the shape of [B, H*W, 10]. ind (torch.Tensor): Index of the ground truth boxes with the shape of [B, max_obj]. - mask (torch.Tensor): Mask of the feature map with the shape - of [B, max_obj]. Default: None. + mask (torch.Tensor, optional): Mask of the feature map with the + shape of [B, max_obj]. Default: None. Returns: torch.Tensor: Feature map after gathering with the shape @@ -403,14 +404,14 @@ def get_targets(self, gt_bboxes_3d, gt_labels_3d): Returns: Returns: - tuple[list[torch.Tensor]]: Tuple of target including \ + tuple[list[torch.Tensor]]: Tuple of target including the following results in order. 
- list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - - list[torch.Tensor]: Indexes indicating the \ + - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - - list[torch.Tensor]: Masks indicating which \ + - list[torch.Tensor]: Masks indicating which boxes are valid. """ heatmaps, anno_boxes, inds, masks = multi_apply( @@ -437,14 +438,14 @@ def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): gt_labels_3d (torch.Tensor): Labels of boxes. Returns: - tuple[list[torch.Tensor]]: Tuple of target including \ + tuple[list[torch.Tensor]]: Tuple of target including the following results in order. - list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - - list[torch.Tensor]: Indexes indicating the position \ + - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - - list[torch.Tensor]: Masks indicating which boxes \ + - list[torch.Tensor]: Masks indicating which boxes are valid. """ device = gt_labels_3d.device @@ -728,11 +729,11 @@ def get_task_detections(self, num_class_with_bg, batch_cls_preds, Returns: list[dict[str: torch.Tensor]]: contains the following keys: - -bboxes (torch.Tensor): Prediction bboxes after nms with the \ + -bboxes (torch.Tensor): Prediction bboxes after nms with the shape of [N, 9]. - -scores (torch.Tensor): Prediction scores after nms with the \ + -scores (torch.Tensor): Prediction scores after nms with the shape of [N]. - -labels (torch.Tensor): Prediction labels after nms with the \ + -labels (torch.Tensor): Prediction labels after nms with the shape of [N]. """ predictions_dicts = [] @@ -781,7 +782,7 @@ def get_task_detections(self, num_class_with_bg, batch_cls_preds, boxes_for_nms, top_scores, thresh=self.test_cfg['nms_thr'], - pre_maxsize=self.test_cfg['pre_max_size'], + pre_max_size=self.test_cfg['pre_max_size'], post_max_size=self.test_cfg['post_max_size']) else: selected = [] diff --git a/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmdet3d/models/dense_heads/fcos_mono3d_head.py index 08e64d6a49..fb43682a4c 100644 --- a/mmdet3d/models/dense_heads/fcos_mono3d_head.py +++ b/mmdet3d/models/dense_heads/fcos_mono3d_head.py @@ -1,12 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. +from logging import warning + import numpy as np import torch -from mmcv.cnn import Scale +from mmcv.cnn import Scale, normal_init from mmcv.runner import force_fp32 from torch import nn as nn -from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr +from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam, + xywhr2xyxyr) from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder from mmdet.models.builder import HEADS, build_loss from .anchor_free_mono3d_head import AnchorFreeMono3DHead @@ -21,31 +25,29 @@ class FCOSMono3DHead(AnchorFreeMono3DHead): num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. - regress_ranges (tuple[tuple[int, int]]): Regress range of multiple + regress_ranges (tuple[tuple[int, int]], optional): Regress range of multiple level points. - center_sampling (bool): If true, use center sampling. Default: True. - center_sample_radius (float): Radius of center sampling. Default: 1.5. - norm_on_bbox (bool): If true, normalize the regression targets + center_sampling (bool, optional): If true, use center sampling. Default: True. + center_sample_radius (float, optional): Radius of center sampling. Default: 1.5. 
+ norm_on_bbox (bool, optional): If true, normalize the regression targets with FPN strides. Default: True. - centerness_on_reg (bool): If true, position centerness on the + centerness_on_reg (bool, optional): If true, position centerness on the regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. Default: True. - centerness_alpha: Parameter used to adjust the intensity attenuation - from the center to the periphery. Default: 2.5. - loss_cls (dict): Config of classification loss. - loss_bbox (dict): Config of localization loss. - loss_dir (dict): Config of direction classification loss. - loss_attr (dict): Config of attribute classification loss. - loss_centerness (dict): Config of centerness loss. - norm_cfg (dict): dictionary to construct and config norm layer. + centerness_alpha (int, optional): Parameter used to adjust the intensity + attenuation from the center to the periphery. Default: 2.5. + loss_cls (dict, optional): Config of classification loss. + loss_bbox (dict, optional): Config of localization loss. + loss_dir (dict, optional): Config of direction classification loss. + loss_attr (dict, optional): Config of attribute classification loss. + loss_centerness (dict, optional): Config of centerness loss. + norm_cfg (dict, optional): dictionary to construct and config norm layer. Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). - centerness_branch (tuple[int]): Channels for centerness branch. + centerness_branch (tuple[int], optional): Channels for centerness branch. Default: (64, ). """ # noqa: E501 def __init__(self, - num_classes, - in_channels, regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384), (384, INF)), center_sampling=True, @@ -73,6 +75,7 @@ def __init__(self, type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), centerness_branch=(64, ), init_cfg=None, @@ -85,8 +88,6 @@ def __init__(self, self.centerness_alpha = centerness_alpha self.centerness_branch = centerness_branch super().__init__( - num_classes, - in_channels, loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dir=loss_dir, @@ -95,13 +96,8 @@ def __init__(self, init_cfg=init_cfg, **kwargs) self.loss_centerness = build_loss(loss_centerness) - if init_cfg is None: - self.init_cfg = dict( - type='Normal', - layer='Conv2d', - std=0.01, - override=dict( - type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) + bbox_coder['code_size'] = self.bbox_code_size + self.bbox_coder = build_bbox_coder(bbox_coder) def _init_layers(self): """Initialize layers of the head.""" @@ -110,9 +106,24 @@ def _init_layers(self): conv_channels=self.centerness_branch, conv_strides=(1, ) * len(self.centerness_branch)) self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1) + self.scale_dim = 3 # only for offset, depth and size regression self.scales = nn.ModuleList([ - nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides - ]) # only for offset, depth and size regression + nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) + for _ in self.strides + ]) + + def init_weights(self): + """Initialize weights of the head. + + We currently still use the customized init_weights because the default + init of DCN triggered by the init_cfg will init conv_offset.weight, + which mistakenly affects the training stability. 
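``_init_layers`` above builds ``scale_dim = 3`` learnable scalars per FPN level (offset, depth and size each get their own). A minimal stand-in for ``mmcv.cnn.Scale`` to make that structure concrete (the FPN strides are an assumption, not pinned down here):

.. code-block:: python

    import torch
    from torch import nn

    class Scale(nn.Module):
        """Learnable per-branch scalar, same idea as mmcv.cnn.Scale."""

        def __init__(self, scale=1.0):
            super().__init__()
            self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

        def forward(self, x):
            return x * self.scale

    strides = (8, 16, 32, 64, 128)  # assumed FPN strides
    scale_dim = 3                   # offset, depth and size regression
    scales = nn.ModuleList(
        nn.ModuleList(Scale(1.0) for _ in range(scale_dim)) for _ in strides)
    print(len(scales), len(scales[0]))  # 5 levels x 3 scales each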
+ """ + super().init_weights() + for m in self.conv_centerness_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + normal_init(self.conv_centerness, std=0.01) def forward(self, feats): """Forward features from the upstream network. @@ -138,11 +149,12 @@ def forward(self, feats): centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. """ + # Note: we use [:5] to filter feats and only return predictions return multi_apply(self.forward_single, feats, self.scales, - self.strides) + self.strides)[:5] def forward_single(self, x, scale, stride): - """Forward features of a single scale levle. + """Forward features of a single scale level. Args: x (Tensor): FPN feature maps of the specified stride. @@ -153,7 +165,7 @@ def forward_single(self, x, scale, stride): is True. Returns: - tuple: scores for each class, bbox and direction class \ + tuple: scores for each class, bbox and direction class predictions, centerness predictions of input feature maps. """ cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ @@ -169,26 +181,12 @@ def forward_single(self, x, scale, stride): for conv_centerness_prev_layer in self.conv_centerness_prev: clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) centerness = self.conv_centerness(clone_cls_feat) - # scale the bbox_pred of different level - # only apply to offset, depth and size prediction - scale_offset, scale_depth, scale_size = scale[0:3] - clone_bbox_pred = bbox_pred.clone() - bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float() - bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float() - bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float() + bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride, + self.training, cls_score) - bbox_pred[:, 2] = bbox_pred[:, 2].exp() - bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0 - - assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ - 'has not been thoroughly tested for FCOS3D.' - if self.norm_on_bbox: - if not self.training: - # Note that this line is conducted only when testing - bbox_pred[:, :2] *= stride - - return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness + return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ + cls_feat, reg_feat @staticmethod def add_sin_difference(boxes1, boxes2): @@ -201,7 +199,7 @@ def add_sin_difference(boxes1, boxes2): the 7th dimension is rotation dimension. Returns: - tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \ + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th dimensions are changed. """ rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( @@ -217,21 +215,27 @@ def add_sin_difference(boxes1, boxes2): @staticmethod def get_direction_target(reg_targets, dir_offset=0, + dir_limit_offset=0.0, num_bins=2, one_hot=True): """Encode direction to 0 ~ num_bins-1. Args: reg_targets (torch.Tensor): Bbox regression targets. - dir_offset (int): Direction offset. - num_bins (int): Number of bins to divide 2*PI. - one_hot (bool): Whether to encode as one hot. + dir_offset (int, optional): Direction offset. Default to 0. + dir_limit_offset (float, optional): Offset to set the direction + range. Default to 0.0. + num_bins (int, optional): Number of bins to divide 2*PI. + Default to 2. + one_hot (bool, optional): Whether to encode as one hot. + Default to True. Returns: torch.Tensor: Encoded direction targets. 
""" rot_gt = reg_targets[..., 6] - offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, + 2 * np.pi) dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) @@ -293,7 +297,7 @@ def loss(self, attr_labels (list[Tensor]): Attributes indices of each box. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. - gt_bboxes_ignore (None | list[Tensor]): specify which bounding + gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Returns: @@ -377,7 +381,10 @@ def loss(self, if self.use_direction_classifier: pos_dir_cls_targets = self.get_direction_target( - pos_bbox_targets_3d, self.dir_offset, one_hot=False) + pos_bbox_targets_3d, + self.dir_offset, + self.dir_limit_offset, + one_hot=False) if self.diff_rad_by_sin: pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( @@ -502,11 +509,11 @@ def get_bboxes(self, rescale (bool): If True, return boxes in original image space Returns: - list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ - The first item is an (n, 5) tensor, where the first 4 columns \ - are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ - 5-th column is a score between 0 and 1. The second item is a \ - (n,) tensor where each item is the predicted class label of \ + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of the corresponding box. """ assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ @@ -575,7 +582,7 @@ def _get_bboxes_single(self, bbox_preds (list[Tensor]): Box energies / deltas for a single scale level with shape (num_points * bbox_code_size, H, W). 
dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on a single scale level with shape \ + predictions on a single scale level with shape (num_points * 2, H, W) attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) @@ -634,7 +641,7 @@ def _get_bboxes_single(self, if rescale: bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor) pred_center2d = bbox_pred[:, :3].clone() - bbox_pred[:, :3] = self.pts2Dto3D(bbox_pred[:, :3], view) + bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view) mlvl_centers2d.append(pred_center2d) mlvl_bboxes.append(bbox_pred) mlvl_scores.append(scores) @@ -647,19 +654,13 @@ def _get_bboxes_single(self, mlvl_dir_scores = torch.cat(mlvl_dir_scores) # change local yaw to global yaw for 3D nms - if mlvl_bboxes.shape[0] > 0: - dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0, - np.pi) - mlvl_bboxes[..., 6] = ( - dir_rot + self.dir_offset + - np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype)) - - cam_intrinsic = mlvl_centers2d.new_zeros((4, 4)) - cam_intrinsic[:view.shape[0], :view.shape[1]] = \ + cam2img = mlvl_centers2d.new_zeros((4, 4)) + cam2img[:view.shape[0], :view.shape[1]] = \ mlvl_centers2d.new_tensor(view) - mlvl_bboxes[:, 6] = torch.atan2( - mlvl_centers2d[:, 0] - cam_intrinsic[0, 2], - cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6] + mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, + mlvl_dir_scores, + self.dir_offset, cam2img) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)).bev) @@ -695,14 +696,18 @@ def _get_bboxes_single(self, def pts2Dto3D(points, view): """ Args: - points (torch.Tensor): points in 2D images, [N, 3], \ + points (torch.Tensor): points in 2D images, [N, 3], 3 corresponds with x, y in the image and depth. - view (np.ndarray): camera instrinsic, [3, 3] + view (np.ndarray): camera intrinsic, [3, 3] Returns: - torch.Tensor: points in 3D space. [N, 3], \ + torch.Tensor: points in 3D space. [N, 3], 3 corresponds with x, y, z in 3D space. """ + warning.warn('DeprecationWarning: This static method has been moved ' + 'out of this class to mmdet3d/core. The function ' + 'pts2Dto3D will be deprecated.') + assert view.shape[0] <= 4 assert view.shape[1] <= 4 assert points.shape[1] == 3 @@ -715,7 +720,7 @@ def pts2Dto3D(points, view): viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view) inv_viewpad = torch.inverse(viewpad).transpose(0, 1) - # Do operation in homogenous coordinates. + # Do operation in homogeneous coordinates. nbr_points = unnorm_points2D.shape[0] homo_points2D = torch.cat( [unnorm_points2D, @@ -762,8 +767,8 @@ def get_targets(self, points, gt_bboxes_list, gt_labels_list, Returns: tuple: - concat_lvl_labels (list[Tensor]): Labels of each level. \ - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + concat_lvl_labels (list[Tensor]): Labels of each level. + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each level. 
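``points_img2cam`` replaces the deprecated ``pts2Dto3D`` above; both undo the perspective projection for ``[u, v, depth]`` points. A self-contained sketch of that unprojection (the intrinsics and sample point are made-up KITTI-like numbers):

.. code-block:: python

    import torch

    def points_img2cam_sketch(points, cam2img):
        # undo the perspective division: (u, v, d) -> (u*d, v*d, d)
        unnorm = torch.cat([points[:, :2] * points[:, 2:3], points[:, 2:3]],
                           dim=1)
        viewpad = torch.eye(4)
        viewpad[:cam2img.shape[0], :cam2img.shape[1]] = cam2img
        inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
        # homogeneous coordinates, then apply the inverse intrinsic
        homo = torch.cat([unnorm, unnorm.new_ones(unnorm.shape[0], 1)], dim=1)
        return (homo @ inv_viewpad)[:, :3]

    cam2img = torch.tensor([[721.5, 0.0, 609.6], [0.0, 721.5, 172.9],
                            [0.0, 0.0, 1.0]])
    print(points_img2cam_sketch(torch.tensor([[640.0, 180.0, 10.0]]), cam2img))
    # roughly (0.42, 0.10, 10.0) in camera coordinates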
""" assert len(points) == len(self.regress_ranges) diff --git a/mmdet3d/models/dense_heads/free_anchor3d_head.py b/mmdet3d/models/dense_heads/free_anchor3d_head.py index cda1142f19..a7f593c0c1 100644 --- a/mmdet3d/models/dense_heads/free_anchor3d_head.py +++ b/mmdet3d/models/dense_heads/free_anchor3d_head.py @@ -195,6 +195,7 @@ def loss(self, matched_anchors, matched_object_targets, self.dir_offset, + self.dir_limit_offset, one_hot=False) loss_dir = self.loss_dir( dir_cls_preds_[matched].transpose(-2, -1), diff --git a/mmdet3d/models/dense_heads/groupfree3d_head.py b/mmdet3d/models/dense_heads/groupfree3d_head.py index 454c5df4d2..dcc3d08641 100644 --- a/mmdet3d/models/dense_heads/groupfree3d_head.py +++ b/mmdet3d/models/dense_heads/groupfree3d_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy + import numpy as np import torch from mmcv import ConfigDict @@ -25,13 +26,13 @@ class PointsObjClsModule(BaseModule): Args: in_channel (int): number of channels of seed point features. - num_convs (int): number of conv layers. + num_convs (int, optional): number of conv layers. Default: 3. - conv_cfg (dict): Config of convolution. + conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). - norm_cfg (dict): Config of normalization. + norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). - act_cfg (dict): Config of activation. + act_cfg (dict, optional): Config of activation. Default: dict(type='ReLU'). """ @@ -299,7 +300,7 @@ def forward(self, feat_dict, sample_mod): """Forward pass. Note: - The forward of GroupFree3DHead is devided into 2 steps: + The forward of GroupFree3DHead is divided into 2 steps: 1. Initial object candidates sampling. 2. Iterative object box prediction by transformer decoder. @@ -405,15 +406,15 @@ def loss(self, Args: bbox_preds (dict): Predictions from forward of vote head. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. ret_target (Bool): Return targets or not. @@ -545,12 +546,12 @@ def get_targets(self, Args: points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): Point-wise instance + pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. max_gt_num (int): Max number of GTs for single batch. @@ -657,12 +658,12 @@ def get_targets_single(self, Args: points (torch.Tensor): Points of each batch. 
- gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (None | torch.Tensor): Point-wise semantic + pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. - pts_instance_mask (None | torch.Tensor): Point-wise instance + pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. max_gt_nums (int): Max number of GTs for single batch. seed_points (torch.Tensor): Coordinates of seed points. @@ -710,7 +711,7 @@ def get_targets_single(self, if self.bbox_coder.with_rot: vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed]) vote_target_idx = points.new_zeros([num_points], dtype=torch.long) - box_indices_all = gt_bboxes_3d.points_in_boxes(points) + box_indices_all = gt_bboxes_3d.points_in_boxes_part(points) for i in range(gt_labels_3d.shape[0]): box_indices = box_indices_all[:, i] indices = torch.nonzero( @@ -880,7 +881,7 @@ def get_bboxes(self, Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ - # support multi-stage predicitons + # support multi-stage predictions assert self.test_cfg['prediction_stages'] in \ ['last', 'all', 'last_three'] @@ -951,7 +952,7 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes(points) + box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) diff --git a/mmdet3d/models/dense_heads/monoflex_head.py b/mmdet3d/models/dense_heads/monoflex_head.py new file mode 100644 index 0000000000..d7db76761e --- /dev/null +++ b/mmdet3d/models/dense_heads/monoflex_head.py @@ -0,0 +1,771 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import xavier_init +from torch import nn as nn + +from mmdet3d.core.utils import get_ellip_gaussian_2D +from mmdet3d.models.model_utils import EdgeFusionModule +from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices, + get_keypoints, handle_proj_objs) +from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder +from mmdet.models.builder import HEADS, build_loss +from mmdet.models.utils import gaussian_radius, gen_gaussian_target +from mmdet.models.utils.gaussian_target import (get_local_maximum, + get_topk_from_heatmap, + transpose_and_gather_feat) +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + + +@HEADS.register_module() +class MonoFlexHead(AnchorFreeMono3DHead): + r"""MonoFlex head used in `MonoFlex `_ + + .. code-block:: none + + / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls + | + | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox + | + | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets + | + | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets + | + | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty + feature + | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty + | + | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions + | + | |--- 1 x 1 conv --> ori cls + | --> 3 x 3 conv --| + | |--- 1 x 1 conv --> ori offsets + | + | --> 3 x 3 conv --> 1 x 1 conv --> depth + | + \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty + + Args: + use_edge_fusion (bool): Whether to use edge fusion module while + feature extraction. + edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion. 
+ edge_heatmap_ratio (float): Ratio of generating target heatmap. + filter_outside_objs (bool, optional): Whether to filter the + outside objects. Default: True. + loss_cls (dict, optional): Config of classification loss. + Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). + loss_bbox (dict, optional): Config of localization loss. + Default: loss_bbox=dict(type='IOULoss', loss_weight=10.0). + loss_dir (dict, optional): Config of direction classification loss. + Default: dict(type='MultibinLoss', loss_weight=0.1). + loss_keypoints (dict, optional): Config of keypoints loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_dims: (dict, optional): Config of dimensions loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_offsets2d: (dict, optional): Config of offsets2d loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_direct_depth: (dict, optional): Config of directly regression depth loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_keypoints_depth: (dict, optional): Config of keypoints decoded depth loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_combined_depth: (dict, optional): Config of combined depth loss. + Default: dict(type='L1Loss', loss_weight=0.1). + loss_attr (dict, optional): Config of attribute classification loss. + In MonoFlex, Default: None. + bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes. + Default: dict(type='MonoFlexCoder', code_size=7). + norm_cfg (dict, optional): Dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + init_cfg (dict): Initialization config dict. Default: None. + """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + use_edge_fusion, + edge_fusion_inds, + edge_heatmap_ratio, + filter_outside_objs=True, + loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.1), + loss_dir=dict(type='MultiBinLoss', loss_weight=0.1), + loss_keypoints=dict(type='L1Loss', loss_weight=0.1), + loss_dims=dict(type='L1Loss', loss_weight=0.1), + loss_offsets2d=dict(type='L1Loss', loss_weight=0.1), + loss_direct_depth=dict(type='L1Loss', loss_weight=0.1), + loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1), + loss_combined_depth=dict(type='L1Loss', loss_weight=0.1), + loss_attr=None, + bbox_coder=dict(type='MonoFlexCoder', code_size=7), + norm_cfg=dict(type='BN'), + init_cfg=None, + init_bias=-2.19, + **kwargs): + self.use_edge_fusion = use_edge_fusion + self.edge_fusion_inds = edge_fusion_inds + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_dir=loss_dir, + loss_attr=loss_attr, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.filter_outside_objs = filter_outside_objs + self.edge_heatmap_ratio = edge_heatmap_ratio + self.init_bias = init_bias + self.loss_dir = build_loss(loss_dir) + self.loss_keypoints = build_loss(loss_keypoints) + self.loss_dims = build_loss(loss_dims) + self.loss_offsets2d = build_loss(loss_offsets2d) + self.loss_direct_depth = build_loss(loss_direct_depth) + self.loss_keypoints_depth = build_loss(loss_keypoints_depth) + self.loss_combined_depth = build_loss(loss_combined_depth) + self.bbox_coder = build_bbox_coder(bbox_coder) + + def _init_edge_module(self): + """Initialize edge fusion module for feature extraction.""" + self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256) + for i in range(len(self.edge_fusion_inds)): + reg_inds, out_inds = 
self.edge_fusion_inds[i] + out_channels = self.group_reg_dims[reg_inds][out_inds] + fusion_layer = EdgeFusionModule(out_channels, 256) + layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}' + self.add_module(layer_name, fusion_layer) + + def init_weights(self): + """Initialize weights.""" + super().init_weights() + self.conv_cls.bias.data.fill_(self.init_bias) + xavier_init(self.conv_regs[4][0], gain=0.01) + xavier_init(self.conv_regs[7][0], gain=0.01) + for m in self.conv_regs.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls_prev = self._init_branch( + conv_channels=self.cls_branch, + conv_strides=(1, ) * len(self.cls_branch)) + self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, + 1) + # init regression head + self.conv_reg_prevs = nn.ModuleList() + # init output head + self.conv_regs = nn.ModuleList() + # group_reg_dims: + # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, )) + for i in range(len(self.group_reg_dims)): + reg_dims = self.group_reg_dims[i] + reg_branch_channels = self.reg_branch[i] + out_channel = self.out_channels[i] + reg_list = nn.ModuleList() + if len(reg_branch_channels) > 0: + self.conv_reg_prevs.append( + self._init_branch( + conv_channels=reg_branch_channels, + conv_strides=(1, ) * len(reg_branch_channels))) + for reg_dim in reg_dims: + reg_list.append(nn.Conv2d(out_channel, reg_dim, 1)) + self.conv_regs.append(reg_list) + else: + self.conv_reg_prevs.append(None) + for reg_dim in reg_dims: + reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1)) + self.conv_regs.append(reg_list) + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_predictor() + if self.use_edge_fusion: + self._init_edge_module() + + def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + gt_bboxes_ignore, proposal_cfg, **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (list[Tensor]): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, + shape (num_gts, self.bbox_code_size). + gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, + shape (num_gts,). + centers2d (list[Tensor]): Projected 3D center of each box, + shape (num_gts, 2). + depths (list[Tensor]): Depth of projected 3D center of each box, + shape (num_gts,). + attr_labels (list[Tensor]): Attribute labels of each box, + shape (num_gts,). + gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. 
+ """ + outs = self(x, input_metas) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, + attr_labels, input_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + input_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes( + *outs, input_metas, cfg=proposal_cfg) + return losses, proposal_list + + def forward(self, feats, input_metas): + """Forward features from the upstream network. + + Args: + feats (list[Tensor]): Features from the upstream network, each is + a 4D-tensor. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + """ + mlvl_input_metas = [input_metas for i in range(len(feats))] + return multi_apply(self.forward_single, feats, mlvl_input_metas) + + def forward_single(self, x, input_metas): + """Forward features of a single scale level. + + Args: + x (Tensor): Feature maps from a specific FPN feature level. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: Scores for each class, bbox predictions. + """ + img_h, img_w = input_metas[0]['pad_shape'][:2] + batch_size, _, feat_h, feat_w = x.shape + downsample_ratio = img_h / feat_h + + for conv_cls_prev_layer in self.conv_cls_prev: + cls_feat = conv_cls_prev_layer(x) + out_cls = self.conv_cls(cls_feat) + + if self.use_edge_fusion: + # calculate the edge indices for the batch data + edge_indices_list = get_edge_indices( + input_metas, downsample_ratio, device=x.device) + edge_lens = [ + edge_indices.shape[0] for edge_indices in edge_indices_list + ] + max_edge_len = max(edge_lens) + edge_indices = x.new_zeros((batch_size, max_edge_len, 2), + dtype=torch.long) + for i in range(batch_size): + edge_indices[i, :edge_lens[i]] = edge_indices_list[i] + # cls feature map edge fusion + out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices, + edge_lens, feat_h, feat_w) + + bbox_pred = [] + + for i in range(len(self.group_reg_dims)): + reg_feat = x.clone() + # feature regression head + if len(self.reg_branch[i]) > 0: + for conv_reg_prev_layer in self.conv_reg_prevs[i]: + reg_feat = conv_reg_prev_layer(reg_feat) + + for j, conv_reg in enumerate(self.conv_regs[i]): + out_reg = conv_reg(reg_feat) + # Use Edge Fusion Module + if self.use_edge_fusion and (i, j) in self.edge_fusion_inds: + # reg feature map edge fusion + out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format( + i, j))(reg_feat, out_reg, edge_indices, edge_lens, + feat_h, feat_w) + bbox_pred.append(out_reg) + + bbox_pred = torch.cat(bbox_pred, dim=1) + cls_score = out_cls.sigmoid() # turn to 0-1 + cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) + + return cls_score, bbox_pred + + def get_bboxes(self, cls_scores, bbox_preds, input_metas): + """Generate bboxes from bbox head predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + bbox_preds (list[Tensor]): Box regression for each scale. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ rescale (bool): If True, return boxes in original image space. + Returns: + list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: + Each item in result_list is 4-tuple. + """ + assert len(cls_scores) == len(bbox_preds) == 1 + cam2imgs = torch.stack([ + cls_scores[0].new_tensor(input_meta['cam2img']) + for input_meta in input_metas + ]) + batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( + cls_scores[0], + bbox_preds[0], + input_metas, + cam2imgs=cam2imgs, + topk=100, + kernel=3) + + result_list = [] + for img_id in range(len(input_metas)): + + bboxes = batch_bboxes[img_id] + scores = batch_scores[img_id] + labels = batch_topk_labels[img_id] + + keep_idx = scores > 0.25 + bboxes = bboxes[keep_idx] + scores = scores[keep_idx] + labels = labels[keep_idx] + + bboxes = input_metas[img_id]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) + attrs = None + result_list.append((bboxes, scores, labels, attrs)) + + return result_list + + def decode_heatmap(self, + cls_score, + reg_pred, + input_metas, + cam2imgs, + topk=100, + kernel=3): + """Transform outputs into detections raw bbox predictions. + + Args: + class_score (Tensor): Center predict heatmap, + shape (B, num_classes, H, W). + reg_pred (Tensor): Box regression map. + shape (B, channel, H , W). + input_metas (List[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cam2imgs (Tensor): Camera intrinsic matrix. + shape (N, 4, 4) + topk (int, optional): Get top k center keypoints from heatmap. + Default 100. + kernel (int, optional): Max pooling kernel for extract local + maximum pixels. Default 3. + + Returns: + tuple[torch.Tensor]: Decoded output of SMOKEHead, containing + the following Tensors: + - batch_bboxes (Tensor): Coords of each 3D box. + shape (B, k, 7) + - batch_scores (Tensor): Scores of each 3D box. + shape (B, k) + - batch_topk_labels (Tensor): Categories of each 3D box. + shape (B, k) + """ + img_h, img_w = input_metas[0]['pad_shape'][:2] + batch_size, _, feat_h, feat_w = cls_score.shape + + downsample_ratio = img_h / feat_h + center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) + + *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( + center_heatmap_pred, k=topk) + batch_scores, batch_index, batch_topk_labels = batch_dets + + regression = transpose_and_gather_feat(reg_pred, batch_index) + regression = regression.view(-1, 8) + + pred_base_centers2d = torch.cat( + [topk_xs.view(-1, 1), + topk_ys.view(-1, 1).float()], dim=1) + preds = self.bbox_coder.decode(regression, batch_topk_labels, + downsample_ratio, cam2imgs) + pred_locations = self.bbox_coder.decode_location( + pred_base_centers2d, preds['offsets2d'], preds['combined_depth'], + cam2imgs, downsample_ratio) + pred_yaws = self.bbox_coder.decode_orientation( + preds['orientations']).unsqueeze(-1) + pred_dims = preds['dimensions'] + batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1) + batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size) + return batch_bboxes, batch_scores, batch_topk_labels + + def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask, + batch_indices, input_metas, downsample_ratio): + """Prepare predictions for computing loss. + + Args: + pred_reg (Tensor): Box regression map. + shape (B, channel, H , W). + labels3d (Tensor): Labels of each 3D box. + shape (B * max_objs, ) + centers2d (Tensor): Coords of each projected 3D box + center on image. 
shape (N, 2) + reg_mask (Tensor): Indexes of the existence of the 3D box. + shape (B * max_objs, ) + batch_indices (Tenosr): Batch indices of the 3D box. + shape (N, 3) + input_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + downsample_ratio (int): The stride of feature map. + + Returns: + dict: The predictions for computing loss. + """ + batch, channel = pred_reg.shape[0], pred_reg.shape[1] + w = pred_reg.shape[3] + cam2imgs = torch.stack([ + centers2d.new_tensor(input_meta['cam2img']) + for input_meta in input_metas + ]) + # (batch_size, 4, 4) -> (N, 4, 4) + cam2imgs = cam2imgs[batch_indices, :, :] + centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] + centers2d_inds = centers2d_inds.view(batch, -1) + pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) + pred_regression_pois = pred_regression.view(-1, channel)[reg_mask] + preds = self.bbox_coder.decode(pred_regression_pois, labels3d, + downsample_ratio, cam2imgs) + + return preds + + def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, + gt_labels_3d_list, centers2d_list, depths_list, feat_shape, + img_shape, input_metas): + """Get training targets for batch images. +`` + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each + image, shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each + box, shape (num_gt,). + gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D + Ground truth bboxes of each image, + shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of + each box, shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D + image, shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). + feat_shape (tuple[int]): Feature map shape with value, + shape (B, _, H, W). + img_shape (tuple[int]): Image shape in [h, w] format. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor, dict]: The Tensor value is the targets of + center heatmap, the dict has components below: + - base_centers2d_target (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2), [dtype: int] + - labels3d (Tensor): Labels of each 3D box. + shape (N, ) + - reg_mask (Tensor): Mask of the existence of the 3D box. + shape (B * max_objs, ) + - batch_indices (Tensor): Batch id of the 3D box. + shape (N, ) + - depth_target (Tensor): Depth target of each 3D box. + shape (N, ) + - keypoints2d_target (Tensor): Keypoints of each projected 3D box + on image. shape (N, 10, 2) + - keypoints_mask (Tensor): Keypoints mask of each projected 3D + box on image. shape (N, 10) + - keypoints_depth_mask (Tensor): Depths decoded from keypoints + of each 3D box. shape (N, 3) + - orientations_target (Tensor): Orientation (encoded local yaw) + target of each 3D box. shape (N, ) + - offsets2d_target (Tensor): Offsets target of each projected + 3D box. shape (N, 2) + - dimensions_target (Tensor): Dimensions target of each 3D box. + shape (N, 3) + - downsample_ratio (int): The stride of feature map. + """ + + img_h, img_w = img_shape[:2] + batch_size, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) # 1/4 + height_ratio = float(feat_h / img_h) # 1/4 + + assert width_ratio == height_ratio + + # Whether to filter the objects which are not in FOV. 
+ if self.filter_outside_objs: + filter_outside_objs(gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, + centers2d_list, input_metas) + + # transform centers2d to base centers2d for regression and + # heatmap generation. + # centers2d = int(base_centers2d) + offsets2d + base_centers2d_list, offsets2d_list, trunc_mask_list = \ + handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas) + + keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \ + get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas) + + center_heatmap_target = gt_bboxes_list[-1].new_zeros( + [batch_size, self.num_classes, feat_h, feat_w]) + + for batch_id in range(batch_size): + # project gt_bboxes from input image to feat map + gt_bboxes = gt_bboxes_list[batch_id] * width_ratio + gt_labels = gt_labels_list[batch_id] + + # project base centers2d from input image to feat map + gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio + trunc_masks = trunc_mask_list[batch_id] + + for j, base_center2d in enumerate(gt_base_centers2d): + if trunc_masks[j]: + # for outside objects, generate ellipse heatmap + base_center2d_x_int, base_center2d_y_int = \ + base_center2d.int() + scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0], + gt_bboxes[j][2] - base_center2d_x_int) + scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1], + gt_bboxes[j][3] - base_center2d_y_int) + radius_x = scale_box_w * self.edge_heatmap_ratio + radius_y = scale_box_h * self.edge_heatmap_ratio + radius_x, radius_y = max(0, int(radius_x)), max( + 0, int(radius_y)) + assert min(radius_x, radius_y) == 0 + ind = gt_labels[j] + get_ellip_gaussian_2D( + center_heatmap_target[batch_id, ind], + [base_center2d_x_int, base_center2d_y_int], radius_x, + radius_y) + else: + base_center2d_x_int, base_center2d_y_int = \ + base_center2d.int() + scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1]) + scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0]) + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.7) + radius = max(0, int(radius)) + ind = gt_labels[j] + gen_gaussian_target( + center_heatmap_target[batch_id, ind], + [base_center2d_x_int, base_center2d_y_int], radius) + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list] + max_objs = max(num_ctrs) + batch_indices = [ + centers2d_list[0].new_full((num_ctrs[i], ), i) + for i in range(batch_size) + ] + batch_indices = torch.cat(batch_indices, dim=0) + reg_mask = torch.zeros( + (batch_size, max_objs), + dtype=torch.bool).to(base_centers2d_list[0].device) + gt_bboxes_3d = input_metas['box_type_3d'].cat(gt_bboxes_3d_list) + gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device) + + # encode original local yaw to multibin format + orienations_target = self.bbox_coder.encode(gt_bboxes_3d) + + batch_base_centers2d = base_centers2d_list[0].new_zeros( + (batch_size, max_objs, 2)) + + for i in range(batch_size): + reg_mask[i, :num_ctrs[i]] = 1 + batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i] + + flatten_reg_mask = reg_mask.flatten() + + # transform base centers2d from input scale to output scale + batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio + + dimensions_target = gt_bboxes_3d.tensor[:, 3:6] + labels_3d = torch.cat(gt_labels_3d_list) + keypoints2d_target = torch.cat(keypoints2d_list) + keypoints_mask = torch.cat(keypoints_mask_list) + keypoints_depth_mask = torch.cat(keypoints_depth_mask_list) + offsets2d_target = torch.cat(offsets2d_list) + 
bboxes2d = torch.cat(gt_bboxes_list)
+
+        # transform FCOS style bbox into [x1, y1, x2, y2] format.
+        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
+                                    dim=-1)
+        depths = torch.cat(depths_list)
+
+        target_labels = dict(
+            base_centers2d_target=batch_base_centers2d.int(),
+            labels3d=labels_3d,
+            reg_mask=flatten_reg_mask,
+            batch_indices=batch_indices,
+            bboxes2d_target=bboxes2d_target,
+            depth_target=depths,
+            keypoints2d_target=keypoints2d_target,
+            keypoints2d_mask=keypoints_mask,
+            keypoints_depth_mask=keypoints_depth_mask,
+            orientations_target=orienations_target,
+            offsets2d_target=offsets2d_target,
+            dimensions_target=dimensions_target,
+            downsample_ratio=1 / width_ratio)
+
+        return center_heatmap_target, avg_factor, target_labels
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             gt_bboxes_3d,
+             gt_labels_3d,
+             centers2d,
+             depths,
+             attr_labels,
+             input_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level, each
+                is a 4D-tensor with shape (B, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box regression maps, each is a
+                4D-tensor whose channel number is the sum of the regression
+                dims, with shape (B, C, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+                shape (num_gts, ).
+            gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
+                truth. It is the flipped gt_bboxes.
+            gt_labels_3d (list[Tensor]): Same as gt_labels.
+            centers2d (list[Tensor]): 2D centers on the image.
+                shape (num_gts, 2).
+            depths (list[Tensor]): Depth ground truth.
+                shape (num_gts, ).
+            attr_labels (list[Tensor]): Attributes indices of each box.
+                In KITTI it is None.
+            input_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+                Default: None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        assert attr_labels is None
+        assert gt_bboxes_ignore is None
+        center2d_heatmap = cls_scores[0]
+        pred_reg = bbox_preds[0]
+
+        center2d_heatmap_target, avg_factor, target_labels = \
+            self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
+                             gt_labels_3d, centers2d, depths,
+                             center2d_heatmap.shape,
+                             input_metas[0]['pad_shape'],
+                             input_metas)
+
+        preds = self.get_predictions(
+            pred_reg=pred_reg,
+            labels3d=target_labels['labels3d'],
+            centers2d=target_labels['base_centers2d_target'],
+            reg_mask=target_labels['reg_mask'],
+            batch_indices=target_labels['batch_indices'],
+            input_metas=input_metas,
+            downsample_ratio=target_labels['downsample_ratio'])
+
+        # heatmap loss
+        loss_cls = self.loss_cls(
+            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
+
+        # bbox2d regression loss
+        loss_bbox = self.loss_bbox(preds['bboxes2d'],
+                                   target_labels['bboxes2d_target'])
+
+        # keypoints loss. The keypoints in predictions and targets are all in
+        # local coordinates. The mask dtype should be bool, not int or float,
+        # so that boolean indexing is used.
+        keypoints2d_mask = target_labels['keypoints2d_mask']
+        loss_keypoints = self.loss_keypoints(
+            preds['keypoints2d'][keypoints2d_mask],
+            target_labels['keypoints2d_target'][keypoints2d_mask])
+
+        # orientations loss
+        loss_dir = self.loss_dir(preds['orientations'],
+                                 target_labels['orientations_target'])
+
+        # dimensions loss
+        loss_dims = self.loss_dims(preds['dimensions'],
+                                   target_labels['dimensions_target'])
+
+        # offsets for center heatmap
+        loss_offsets2d = self.loss_offsets2d(preds['offsets2d'],
+                                             target_labels['offsets2d_target'])
+
+        # directly regressed depth loss with direct depth uncertainty loss
+        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
+        loss_weight_1 = self.loss_direct_depth.loss_weight
+        loss_direct_depth = self.loss_direct_depth(
+            preds['direct_depth'], target_labels['depth_target'],
+            direct_depth_weights)
+        loss_uncertainty_1 = \
+            preds['direct_depth_uncertainty'] * loss_weight_1
+        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
+
+        # keypoints decoded depth loss with keypoints depth uncertainty loss
+        depth_mask = target_labels['keypoints_depth_mask']
+        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
+        valid_keypoints_depth_uncertainty = preds[
+            'keypoints_depth_uncertainty'][depth_mask]
+        valid_keypoints_depth_weights = torch.exp(
+            -valid_keypoints_depth_uncertainty)
+        loss_keypoints_depth = self.loss_keypoints_depth(
+            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
+            valid_keypoints_depth_weights)
+        loss_weight_2 = self.loss_keypoints_depth.loss_weight
+        loss_uncertainty_2 = \
+            valid_keypoints_depth_uncertainty * loss_weight_2
+        loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
+
+        # combined depth loss to optimize the uncertainty
+        loss_combined_depth = self.loss_combined_depth(
+            preds['combined_depth'], target_labels['depth_target'])
+
+        loss_dict = dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_keypoints=loss_keypoints,
+            loss_dir=loss_dir,
+            loss_dims=loss_dims,
+            loss_offsets2d=loss_offsets2d,
+            loss_direct_depth=loss_direct_depth,
+            loss_keypoints_depth=loss_keypoints_depth,
+            loss_combined_depth=loss_combined_depth)
+
+        return loss_dict
diff --git a/mmdet3d/models/dense_heads/parta2_rpn_head.py b/mmdet3d/models/dense_heads/parta2_rpn_head.py
index 258004c57b..a73ec981fd 100644
--- a/mmdet3d/models/dense_heads/parta2_rpn_head.py
+++ b/mmdet3d/models/dense_heads/parta2_rpn_head.py
@@ -60,15 +60,15 @@ def __init__(self,
                 type='Anchor3DRangeGenerator',
                 range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
                 strides=[2],
-                sizes=[[1.6, 3.9, 1.56]],
+                sizes=[[3.9, 1.6, 1.56]],
                 rotations=[0, 1.57],
                 custom_values=[],
                 reshape_out=False),
             assigner_per_size=False,
             assign_per_class=False,
             diff_rad_by_sin=True,
-            dir_offset=0,
-            dir_limit_offset=1,
+            dir_offset=-np.pi / 2,
+            dir_limit_offset=0,
             bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
             loss_cls=dict(
                 type='CrossEntropyLoss',
@@ -100,20 +100,20 @@ def loss(self,
             bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
             dir_cls_preds (list[torch.Tensor]): Multi-level direction
                 class predictions.
-            gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes \
+            gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes
                of each sample.
             gt_labels (list[torch.Tensor]): Labels of each sample.
             input_metas (list[dict]): Point cloud and image's meta info.
- gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and \ + dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_rpn_cls (list[torch.Tensor]): Classification losses. - loss_rpn_bbox (list[torch.Tensor]): Box regression losses. - - loss_rpn_dir (list[torch.Tensor]): Direction classification \ + - loss_rpn_dir (list[torch.Tensor]): Direction classification losses. """ loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds, @@ -143,7 +143,7 @@ def get_bboxes_single(self, mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. - cfg (None | :obj:`ConfigDict`): Training or testing config. + cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): whether th rescale bbox. Returns: @@ -207,7 +207,7 @@ def get_bboxes_single(self, mlvl_dir_scores = torch.cat(mlvl_dir_scores) # shape [k, num_class] before sigmoid # PartA2 need to keep raw classification score - # becase the bbox head in the second stage does not have + # because the bbox head in the second stage does not have # classification branch, # roi head need this score as classification score mlvl_cls_score = torch.cat(mlvl_cls_score) @@ -240,7 +240,7 @@ def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, Multi-level bbox. score_thr (int): Score threshold. max_num (int): Max number of bboxes after nms. - cfg (None | :obj:`ConfigDict`): Training or testing config. + cfg (:obj:`ConfigDict`): Training or testing config. input_meta (dict): Contain pcd and img's meta info. Returns: diff --git a/mmdet3d/models/dense_heads/pgd_head.py b/mmdet3d/models/dense_heads/pgd_head.py new file mode 100644 index 0000000000..0c71b8d27b --- /dev/null +++ b/mmdet3d/models/dense_heads/pgd_head.py @@ -0,0 +1,1229 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.cnn import Scale, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr +from mmdet3d.core.bbox import points_cam2img, points_img2cam +from mmdet.core import distance2bbox, multi_apply +from mmdet.models.builder import HEADS, build_loss +from .fcos_mono3d_head import FCOSMono3DHead + + +@HEADS.register_module() +class PGDHead(FCOSMono3DHead): + r"""Anchor-free head used in `PGD `_. + + Args: + use_depth_classifer (bool, optional): Whether to use depth classifier. + Defaults to True. + use_only_reg_proj (bool, optional): Whether to use only direct + regressed depth in the re-projection (to make the network easier + to learn). Defaults to False. + weight_dim (int, optional): Dimension of the location-aware weight + map. Defaults to -1. + weight_branch (tuple[tuple[int]], optional): Feature map channels of + the convolutional branch for weight map. Defaults to ((256, ), ). + depth_branch (tuple[int], optional): Feature map channels of the + branch for probabilistic depth estimation. Defaults to (64, ), + depth_range (tuple[float], optional): Range of depth estimation. + Defaults to (0, 70), + depth_unit (int, optional): Unit of depth range division. Defaults to + 10. + division (str, optional): Depth division method. Options include + 'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'. 
+ depth_bins (int, optional): Discrete bins of depth division. Defaults + to 8. + loss_depth (dict, optional): Depth loss. Defaults to dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). + loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to + dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). + loss_consistency (dict, optional): Consistency loss. Defaults to + dict(type='GIoULoss', loss_weight=1.0), + pred_velo (bool, optional): Whether to predict velocity. Defaults to + False. + pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes. + Defaults to True. + pred_keypoints (bool, optional): Whether to predict keypoints. + Defaults to False, + bbox_coder (dict, optional): Bounding box coder. Defaults to + dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ), + base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), + code_size=7). + """ + + def __init__(self, + use_depth_classifier=True, + use_onlyreg_proj=False, + weight_dim=-1, + weight_branch=((256, ), ), + depth_branch=(64, ), + depth_range=(0, 70), + depth_unit=10, + division='uniform', + depth_bins=8, + loss_depth=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_bbox2d=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_consistency=dict(type='GIoULoss', loss_weight=1.0), + pred_bbox2d=True, + pred_keypoints=False, + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((28.01, 16.32), ), + base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), + (3.9, 1.56, 1.6)), + code_size=7), + **kwargs): + self.use_depth_classifier = use_depth_classifier + self.use_onlyreg_proj = use_onlyreg_proj + self.depth_branch = depth_branch + self.pred_keypoints = pred_keypoints + self.weight_dim = weight_dim + self.weight_branch = weight_branch + self.weight_out_channels = [] + for weight_branch_channels in weight_branch: + if len(weight_branch_channels) > 0: + self.weight_out_channels.append(weight_branch_channels[-1]) + else: + self.weight_out_channels.append(-1) + self.depth_range = depth_range + self.depth_unit = depth_unit + self.division = division + if self.division == 'uniform': + self.num_depth_cls = int( + (depth_range[1] - depth_range[0]) / depth_unit) + 1 + if self.num_depth_cls != depth_bins: + print('Warning: The number of bins computed from ' + + 'depth_unit is different from given parameter! 
' + + 'Depth_unit will be considered with priority in ' + + 'Uniform Division.') + else: + self.num_depth_cls = depth_bins + super().__init__( + pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs) + self.loss_depth = build_loss(loss_depth) + if self.pred_bbox2d: + self.loss_bbox2d = build_loss(loss_bbox2d) + self.loss_consistency = build_loss(loss_consistency) + if self.pred_keypoints: + self.kpts_start = 9 if self.pred_velo else 7 + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + if self.pred_bbox2d: + self.scale_dim += 1 + if self.pred_keypoints: + self.scale_dim += 1 + self.scales = nn.ModuleList([ + nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) + for _ in self.strides + ]) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + super()._init_predictor() + + if self.use_depth_classifier: + self.conv_depth_cls_prev = self._init_branch( + conv_channels=self.depth_branch, + conv_strides=(1, ) * len(self.depth_branch)) + self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1], + self.num_depth_cls, 1) + # Data-agnostic single param lambda for local depth fusion + self.fuse_lambda = nn.Parameter(torch.tensor(10e-5)) + + if self.weight_dim != -1: + self.conv_weight_prevs = nn.ModuleList() + self.conv_weights = nn.ModuleList() + for i in range(self.weight_dim): + weight_branch_channels = self.weight_branch[i] + weight_out_channel = self.weight_out_channels[i] + if len(weight_branch_channels) > 0: + self.conv_weight_prevs.append( + self._init_branch( + conv_channels=weight_branch_channels, + conv_strides=(1, ) * len(weight_branch_channels))) + self.conv_weights.append( + nn.Conv2d(weight_out_channel, 1, 1)) + else: + self.conv_weight_prevs.append(None) + self.conv_weights.append( + nn.Conv2d(self.feat_channels, 1, 1)) + + def init_weights(self): + """Initialize weights of the head. + + We currently still use the customized defined init_weights because the + default init of DCN triggered by the init_cfg will init + conv_offset.weight, which mistakenly affects the training stability. + """ + super().init_weights() + + bias_cls = bias_init_with_prob(0.01) + if self.use_depth_classifier: + for m in self.conv_depth_cls_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls) + + if self.weight_dim != -1: + for conv_weight_prev in self.conv_weight_prevs: + if conv_weight_prev is None: + continue + for m in conv_weight_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + for conv_weight in self.conv_weights: + normal_init(conv_weight, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2). + weight (list[Tensor]): Location-aware weight maps on each + scale level, each is a 4D-tensor, the channel number is + num_points * 1. 
+ depth_cls_preds (list[Tensor]): Box scores for depth class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + attr_preds (list[Tensor]): Attribute scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, feats, self.scales, + self.strides) + + def forward_single(self, x, scale, stride): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox and direction class + predictions, depth class predictions, location-aware weights, + attribute and centerness predictions of input feature maps. + """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \ + reg_feat = super().forward_single(x, scale, stride) + + max_regress_range = stride * self.regress_ranges[0][1] / \ + self.strides[0] + bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride, + max_regress_range, self.training, + self.pred_keypoints, + self.pred_bbox2d) + + depth_cls_pred = None + if self.use_depth_classifier: + clone_reg_feat = reg_feat.clone() + for conv_depth_cls_prev_layer in self.conv_depth_cls_prev: + clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat) + depth_cls_pred = self.conv_depth_cls(clone_reg_feat) + + weight = None + if self.weight_dim != -1: + weight = [] + for i in range(self.weight_dim): + clone_reg_feat = reg_feat.clone() + if len(self.weight_branch[i]) > 0: + for conv_weight_prev_layer in self.conv_weight_prevs[i]: + clone_reg_feat = conv_weight_prev_layer(clone_reg_feat) + weight.append(self.conv_weights[i](clone_reg_feat)) + weight = torch.cat(weight, dim=1) + + return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ + attr_pred, centerness + + def get_proj_bbox2d(self, + bbox_preds, + pos_dir_cls_preds, + labels_3d, + bbox_targets_3d, + pos_points, + pos_inds, + img_metas, + pos_depth_cls_preds=None, + pos_weights=None, + pos_cls_scores=None, + with_kpts=False): + """Decode box predictions and get projected 2D attributes. + + Args: + bbox_preds (list[Tensor]): Box predictions for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + pos_dir_cls_preds (Tensor): Box scores for direction class + predictions of positive boxes on all the scale levels in shape + (num_pos_points, 2). + labels_3d (list[Tensor]): 3D box category labels for each scale + level, each is a 4D-tensor. + bbox_targets_3d (list[Tensor]): 3D box targets for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + pos_points (Tensor): Foreground points. + pos_inds (Tensor): Index of foreground points from flattened + tensors. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of + positive boxes on all the scale levels in shape + (num_pos_points, self.num_depth_cls). Defaults to None. + pos_weights (Tensor, optional): Location-aware weights of positive + boxes in shape (num_pos_points, self.weight_dim). 
Defaults to + None. + pos_cls_scores (Tensor, optional): Classification scores of + positive boxes in shape (num_pos_points, self.num_classes). + Defaults to None. + with_kpts (bool, optional): Whether to output keypoints targets. + Defaults to False. + + Returns: + tuple[Tensor]: Exterior 2D boxes from projected 3D boxes, + predicted 2D boxes and keypoint targets (if necessary). + """ + views = [np.array(img_meta['cam2img']) for img_meta in img_metas] + num_imgs = len(img_metas) + img_idx = [] + for label in labels_3d: + for idx in range(num_imgs): + img_idx.append( + labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx) + img_idx = torch.cat(img_idx) + pos_img_idx = img_idx[pos_inds] + + flatten_strided_bbox_preds = [] + flatten_strided_bbox2d_preds = [] + flatten_bbox_targets_3d = [] + flatten_strides = [] + + for stride_idx, bbox_pred in enumerate(bbox_preds): + flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape( + -1, sum(self.group_reg_dims)) + flatten_bbox_pred[:, :2] *= self.strides[stride_idx] + flatten_bbox_pred[:, -4:] *= self.strides[stride_idx] + flatten_strided_bbox_preds.append( + flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size]) + flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:]) + + bbox_target_3d = bbox_targets_3d[stride_idx].clone() + bbox_target_3d[:, :2] *= self.strides[stride_idx] + bbox_target_3d[:, -4:] *= self.strides[stride_idx] + flatten_bbox_targets_3d.append(bbox_target_3d) + + flatten_stride = flatten_bbox_pred.new_ones( + *flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx] + flatten_strides.append(flatten_stride) + + flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds) + flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds) + flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d) + flatten_strides = torch.cat(flatten_strides) + pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds] + pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds] + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_strides = flatten_strides[pos_inds] + + pos_decoded_bbox2d_preds = distance2bbox(pos_points, + pos_strided_bbox2d_preds) + + pos_strided_bbox_preds[:, :2] = \ + pos_points - pos_strided_bbox_preds[:, :2] + pos_bbox_targets_3d[:, :2] = \ + pos_points - pos_bbox_targets_3d[:, :2] + + if self.use_depth_classifier and (not self.use_onlyreg_proj): + pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( + pos_depth_cls_preds, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + pos_strided_bbox_preds[:, 2] = \ + sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \ + (1 - sig_alpha) * pos_prob_depth_preds + + box_corners_in_image = pos_strided_bbox_preds.new_zeros( + (*pos_strided_bbox_preds.shape[:-1], 8, 2)) + box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros( + (*pos_strided_bbox_preds.shape[:-1], 8, 2)) + + for idx in range(num_imgs): + mask = (pos_img_idx == idx) + if pos_strided_bbox_preds[mask].shape[0] == 0: + continue + cam2img = torch.eye( + 4, + dtype=pos_strided_bbox_preds.dtype, + device=pos_strided_bbox_preds.device) + view_shape = views[idx].shape + cam2img[:view_shape[0], :view_shape[1]] = \ + pos_strided_bbox_preds.new_tensor(views[idx]) + + centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2] + centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2] + centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3], + views[idx]) + + # use predicted depth to re-project the 
2.5D centers + pos_strided_bbox_preds[mask, :3] = points_img2cam( + pos_strided_bbox_preds[mask, :3], views[idx]) + pos_bbox_targets_3d[mask, :3] = centers3d_targets + + # depth fixed when computing re-project 3D bboxes + pos_strided_bbox_preds[mask, 2] = \ + pos_bbox_targets_3d.clone()[mask, 2] + + # decode yaws + if self.use_direction_classifier: + pos_dir_cls_scores = torch.max( + pos_dir_cls_preds[mask], dim=-1)[1] + pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw( + pos_strided_bbox_preds[mask], centers2d_preds, + pos_dir_cls_scores, self.dir_offset, cam2img) + pos_bbox_targets_3d[mask, 6] = torch.atan2( + centers2d_targets[:, 0] - cam2img[0, 2], + cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6] + + corners = img_metas[0]['box_type_3d']( + pos_strided_bbox_preds[mask], + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).corners + box_corners_in_image[mask] = points_cam2img(corners, cam2img) + + corners_gt = img_metas[0]['box_type_3d']( + pos_bbox_targets_3d[mask, :self.bbox_code_size], + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).corners + box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img) + + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1) + + outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds) + + if with_kpts: + norm_strides = pos_strides * self.regress_ranges[0][1] / \ + self.strides[0] + kpts_targets = box_corners_in_image_gt - pos_points[..., None, :] + kpts_targets = kpts_targets.view( + (*pos_strided_bbox_preds.shape[:-1], 16)) + kpts_targets /= norm_strides + + outputs += (kpts_targets, ) + + return outputs + + def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds, + weights, attr_preds, centernesses, pos_inds, + img_metas): + """Flatten predictions and get positive ones. + + Args: + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + pos_inds (Tensor): Index of foreground points from flattened + tensors. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor]: Box predictions, direction classes, probabilistic + depth maps, location-aware weight maps, attributes and + centerness predictions. 
+ """ + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) + for bbox_pred in bbox_preds + ] + flatten_dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) + for dir_cls_pred in dir_cls_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) + flatten_centerness = torch.cat(flatten_centerness) + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + + pos_depth_cls_preds = None + if self.use_depth_classifier: + flatten_depth_cls_preds = [ + depth_cls_pred.permute(0, 2, 3, + 1).reshape(-1, self.num_depth_cls) + for depth_cls_pred in depth_cls_preds + ] + flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds) + pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds] + + pos_weights = None + if self.weight_dim != -1: + flatten_weights = [ + weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim) + for weight in weights + ] + flatten_weights = torch.cat(flatten_weights) + pos_weights = flatten_weights[pos_inds] + + pos_attr_preds = None + if self.pred_attrs: + flatten_attr_preds = [ + attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) + for attr_pred in attr_preds + ] + flatten_attr_preds = torch.cat(flatten_attr_preds) + pos_attr_preds = flatten_attr_preds[pos_inds] + + return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \ + pos_weights, pos_attr_preds, pos_centerness + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', + 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + weights (list[Tensor]): Location-aware weights for each scale + level, each is a 4D-tensor, the channel number is + num_points * self.weight_dim. + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of + (num_gts, code_size). + gt_labels_3d (list[Tensor]): same as gt_labels + centers2d (list[Tensor]): 2D centers on the image with shape of + (num_gts, 2). 
+ depths (list[Tensor]): Depth ground truth with shape of + (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can + be ignored when computing the loss. Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(depth_cls_preds) == len(weights) == len(centernesses) == \ + len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ + 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ + f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ + f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ + f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ + self.get_targets( + all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores and targets + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_labels_3d = torch.cat(labels_3d) + flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) + flatten_centerness_targets = torch.cat(centerness_targets) + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + if self.pred_attrs: + flatten_attr_targets = torch.cat(attr_targets) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels_3d >= 0) + & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) + num_pos = len(pos_inds) + + loss_dict = dict() + + loss_dict['loss_cls'] = self.loss_cls( + flatten_cls_scores, + flatten_labels_3d, + avg_factor=num_pos + num_imgs) # avoid num_pos is 0 + + pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \ + pos_attr_preds, pos_centerness = self.get_pos_predictions( + bbox_preds, dir_cls_preds, depth_cls_preds, weights, + attr_preds, centernesses, pos_inds, img_metas) + + if num_pos > 0: + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_centerness_targets = flatten_centerness_targets[pos_inds] + pos_points = flatten_points[pos_inds] + if self.pred_attrs: + pos_attr_targets = flatten_attr_targets[pos_inds] + if self.use_direction_classifier: + pos_dir_cls_targets = self.get_direction_target( + pos_bbox_targets_3d, self.dir_offset, one_hot=False) + + bbox_weights = pos_centerness_targets.new_ones( + len(pos_centerness_targets), sum(self.group_reg_dims)) + equal_weights = pos_centerness_targets.new_ones( + pos_centerness_targets.shape) + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + assert len(code_weight) == sum(self.group_reg_dims) + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + + if self.diff_rad_by_sin: + pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( + pos_bbox_preds, pos_bbox_targets_3d) + + loss_dict['loss_offset'] = self.loss_bbox( + pos_bbox_preds[:, :2], + pos_bbox_targets_3d[:, :2], + weight=bbox_weights[:, :2], + avg_factor=equal_weights.sum()) + loss_dict['loss_size'] = 
self.loss_bbox( + pos_bbox_preds[:, 3:6], + pos_bbox_targets_3d[:, 3:6], + weight=bbox_weights[:, 3:6], + avg_factor=equal_weights.sum()) + loss_dict['loss_rotsin'] = self.loss_bbox( + pos_bbox_preds[:, 6], + pos_bbox_targets_3d[:, 6], + weight=bbox_weights[:, 6], + avg_factor=equal_weights.sum()) + if self.pred_velo: + loss_dict['loss_velo'] = self.loss_bbox( + pos_bbox_preds[:, 7:9], + pos_bbox_targets_3d[:, 7:9], + weight=bbox_weights[:, 7:9], + avg_factor=equal_weights.sum()) + + proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d, + bbox_targets_3d, pos_points, pos_inds, + img_metas) + + # direction classification loss + # TODO: add more check for use_direction_classifier + if self.use_direction_classifier: + loss_dict['loss_dir'] = self.loss_dir( + pos_dir_cls_preds, + pos_dir_cls_targets, + equal_weights, + avg_factor=equal_weights.sum()) + + # init depth loss with the one computed from direct regression + loss_dict['loss_depth'] = self.loss_bbox( + pos_bbox_preds[:, 2], + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + # depth classification loss + if self.use_depth_classifier: + pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( + pos_depth_cls_preds, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + if self.weight_dim != -1: + loss_fuse_depth = self.loss_depth( + sig_alpha * pos_bbox_preds[:, 2] + + (1 - sig_alpha) * pos_prob_depth_preds, + pos_bbox_targets_3d[:, 2], + sigma=pos_weights[:, 0], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + else: + loss_fuse_depth = self.loss_depth( + sig_alpha * pos_bbox_preds[:, 2] + + (1 - sig_alpha) * pos_prob_depth_preds, + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + loss_dict['loss_depth'] = loss_fuse_depth + + proj_bbox2d_inputs += (pos_depth_cls_preds, ) + + if self.pred_keypoints: + # use smoothL1 to compute consistency loss for keypoints + # normalize the offsets with strides + proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \ + self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True) + loss_dict['loss_kpts'] = self.loss_bbox( + pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16], + kpts_targets, + weight=bbox_weights[:, + self.kpts_start:self.kpts_start + 16], + avg_factor=equal_weights.sum()) + + if self.pred_bbox2d: + loss_dict['loss_bbox2d'] = self.loss_bbox2d( + pos_bbox_preds[:, -4:], + pos_bbox_targets_3d[:, -4:], + weight=bbox_weights[:, -4:], + avg_factor=equal_weights.sum()) + if not self.pred_keypoints: + proj_bbox2d_preds, pos_decoded_bbox2d_preds = \ + self.get_proj_bbox2d(*proj_bbox2d_inputs) + loss_dict['loss_consistency'] = self.loss_consistency( + proj_bbox2d_preds, + pos_decoded_bbox2d_preds, + weight=bbox_weights[:, -4:], + avg_factor=equal_weights.sum()) + + loss_dict['loss_centerness'] = self.loss_centerness( + pos_centerness, pos_centerness_targets) + + # attribute classification loss + if self.pred_attrs: + loss_dict['loss_attr'] = self.loss_attr( + pos_attr_preds, + pos_attr_targets, + pos_centerness_targets, + avg_factor=pos_centerness_targets.sum()) + + else: + # need absolute due to possible negative delta x/y + loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum() + loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum() + loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum() + loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum() + if self.pred_velo: + loss_dict['loss_velo'] = pos_bbox_preds[:, 
7:9].sum() + if self.pred_keypoints: + loss_dict['loss_kpts'] = pos_bbox_preds[:, + self.kpts_start:self. + kpts_start + 16].sum() + if self.pred_bbox2d: + loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum() + loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum() + loss_dict['loss_centerness'] = pos_centerness.sum() + if self.use_direction_classifier: + loss_dict['loss_dir'] = pos_dir_cls_preds.sum() + if self.use_depth_classifier: + sig_alpha = torch.sigmoid(self.fuse_lambda) + loss_fuse_depth = \ + sig_alpha * pos_bbox_preds[:, 2].sum() + \ + (1 - sig_alpha) * pos_depth_cls_preds.sum() + if self.weight_dim != -1: + loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum()) + loss_dict['loss_depth'] = loss_fuse_depth + if self.pred_attrs: + loss_dict['loss_attr'] = pos_attr_preds.sum() + + return loss_dict + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', + 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W) + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + weights (list[Tensor]): Location-aware weights for each scale + level, each is a 4D-tensor, the channel number is + num_points * self.weight_dim. + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for each scale level with + shape (N, num_points * 1, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config, optional): Test / postprocessing configuration, + if None, test_cfg would be used. Defaults to None. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to None. + + Returns: + list[tuple[Tensor]]: Each item in result_list is a tuple, which + consists of predicted 3D boxes, scores, labels, attributes and + 2D boxes (if necessary). + """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(depth_cls_preds) == len(weights) == len(centernesses) == \ + len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ + 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ + f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ + f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ + f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' 
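+        # Decode each image independently. For every feature level, gather
+        # this image's predictions from all branches; when an optional branch
+        # (direction classifier, probabilistic depth, location-aware weight,
+        # attribute) is disabled, an all-zero or background-label placeholder
+        # of the matching shape is substituted so that `_get_bboxes_single`
+        # always receives the same tuple layout.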
+ num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + if self.use_direction_classifier: + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + dir_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [2, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.use_depth_classifier: + depth_cls_pred_list = [ + depth_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + depth_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_depth_cls, *cls_scores[i][img_id].shape[1:]], + 0).detach() for i in range(num_levels) + ] + if self.weight_dim != -1: + weight_list = [ + weights[i][img_id].detach() for i in range(num_levels) + ] + else: + weight_list = [ + cls_scores[i][img_id].new_full( + [1, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.pred_attrs: + attr_pred_list = [ + attr_preds[i][img_id].detach() for i in range(num_levels) + ] + else: + attr_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_attrs, *cls_scores[i][img_id].shape[1:]], + self.attr_background_label).detach() + for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + input_meta = img_metas[img_id] + det_bboxes = self._get_bboxes_single( + cls_score_list, bbox_pred_list, dir_cls_pred_list, + depth_cls_pred_list, weight_list, attr_pred_list, + centerness_pred_list, mlvl_points, input_meta, cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + mlvl_points, + input_meta, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + Has shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single scale + level with shape (num_points * bbox_code_size, H, W). + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on a single scale level with shape + (num_points * 2, H, W) + depth_cls_preds (list[Tensor]): Box scores for probabilistic depth + predictions on a single scale level with shape + (num_points * self.num_depth_cls, H, W) + weights (list[Tensor]): Location-aware weight maps on a single + scale level with shape (num_points * self.weight_dim, H, W). + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_points, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 2). + input_meta (dict): Metadata of input image. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to False. + + Returns: + tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and + 2D boxes (if necessary). 
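When one of the optional branches (direction, depth bins, location-aware weights, attributes) is disabled, the loop above still fills the corresponding per-level list with placeholder tensors created via new_full, so that _get_bboxes_single keeps a single signature. A small sketch of that pattern with made-up shapes:

import torch

cls_score = torch.randn(3, 24, 80)        # (num_classes, H, W) for one image at one level, hypothetical
use_direction_classifier = False
num_attrs, attr_background_label = 9, 9
pred_attrs = False

if use_direction_classifier:
    dir_cls_pred = torch.randn(2, *cls_score.shape[1:])
else:
    # All-zero logits: argmax returns bin 0 everywhere, i.e. a neutral direction.
    dir_cls_pred = cls_score.new_full([2, *cls_score.shape[1:]], 0)

if not pred_attrs:
    # Attributes default to the background label so they are ignored downstream.
    attr_pred = cls_score.new_full([num_attrs, *cls_score.shape[1:]], attr_background_label)

print(dir_cls_pred.shape, attr_pred.shape)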
+ """ + view = np.array(input_meta['cam2img']) + scale_factor = input_meta['scale_factor'] + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_centers2d = [] + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_attr_scores = [] + mlvl_centerness = [] + mlvl_depth_cls_scores = [] + mlvl_depth_uncertainty = [] + mlvl_bboxes2d = None + if self.pred_bbox2d: + mlvl_bboxes2d = [] + + for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ + attr_pred, centerness, points in zip( + cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, + weights, attr_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape( + -1, self.num_depth_cls) + depth_cls_score = F.softmax( + depth_cls_pred, dim=-1).topk( + k=2, dim=-1)[0].mean(dim=-1) + if self.weight_dim != -1: + weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim) + else: + weight = weight.permute(1, 2, 0).reshape(-1, 1) + depth_uncertainty = torch.exp(-weight[:, -1]) + attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) + attr_score = torch.max(attr_pred, dim=-1)[1] + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, + sum(self.group_reg_dims)) + bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size] + if self.pred_bbox2d: + bbox_pred2d = bbox_pred[:, -4:] + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + merged_scores = scores * centerness[:, None] + if self.use_depth_classifier: + merged_scores *= depth_cls_score[:, None] + if self.weight_dim != -1: + merged_scores *= depth_uncertainty[:, None] + max_scores, _ = merged_scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred3d = bbox_pred3d[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_pred = dir_cls_pred[topk_inds, :] + depth_cls_pred = depth_cls_pred[topk_inds, :] + centerness = centerness[topk_inds] + dir_cls_score = dir_cls_score[topk_inds] + depth_cls_score = depth_cls_score[topk_inds] + depth_uncertainty = depth_uncertainty[topk_inds] + attr_score = attr_score[topk_inds] + if self.pred_bbox2d: + bbox_pred2d = bbox_pred2d[topk_inds, :] + # change the offset to actual center predictions + bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] + if rescale: + bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( + scale_factor) + if self.pred_bbox2d: + bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor) + if self.use_depth_classifier: + prob_depth_pred = self.bbox_coder.decode_prob_depth( + depth_cls_pred, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \ + (1 - sig_alpha) * prob_depth_pred + pred_center2d = bbox_pred3d[:, :3].clone() + bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view) + mlvl_centers2d.append(pred_center2d) + mlvl_bboxes.append(bbox_pred3d) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + mlvl_depth_cls_scores.append(depth_cls_score) + mlvl_attr_scores.append(attr_score) + mlvl_centerness.append(centerness) + mlvl_depth_uncertainty.append(depth_uncertainty) + if 
self.pred_bbox2d: + bbox_pred2d = distance2bbox( + points, bbox_pred2d, max_shape=input_meta['img_shape']) + mlvl_bboxes2d.append(bbox_pred2d) + + mlvl_centers2d = torch.cat(mlvl_centers2d) + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + if self.pred_bbox2d: + mlvl_bboxes2d = torch.cat(mlvl_bboxes2d) + + # change local yaw to global yaw for 3D nms + cam2img = torch.eye( + 4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device) + cam2img[:view.shape[0], :view.shape[1]] = \ + mlvl_centers2d.new_tensor(view) + mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, + mlvl_dir_scores, + self.dir_offset, cam2img) + + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).bev) + + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_attr_scores = torch.cat(mlvl_attr_scores) + mlvl_centerness = torch.cat(mlvl_centerness) + # no scale_factors in box3d_multiclass_nms + # Then we multiply it from outside + mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] + if self.use_depth_classifier: # multiply the depth confidence + mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores) + mlvl_nms_scores *= mlvl_depth_cls_scores[:, None] + if self.weight_dim != -1: + mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty) + mlvl_nms_scores *= mlvl_depth_uncertainty[:, None] + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_nms_scores, cfg.score_thr, + cfg.max_per_img, cfg, mlvl_dir_scores, + mlvl_attr_scores, mlvl_bboxes2d) + bboxes, scores, labels, dir_scores, attrs = results[0:5] + attrs = attrs.to(labels.dtype) # change data type to int + bboxes = input_meta['box_type_3d']( + bboxes, + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)) + # Note that the predictions use origin (0.5, 0.5, 0.5) + # Due to the ground truth centers2d are the gravity center of objects + # v0.10.0 fix inplace operation to the input tensor of cam_box3d + # So here we also need to add origin=(0.5, 0.5, 0.5) + if not self.pred_attrs: + attrs = None + + outputs = (bboxes, scores, labels, attrs) + if self.pred_bbox2d: + bboxes2d = results[-1] + bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1) + outputs = outputs + (bboxes2d, ) + + return outputs + + def get_targets(self, points, gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, + depths_list, attr_labels_list): + """Compute regression, classification and centerss targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). 
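The score handling just before box3d_multiclass_nms above boils down to: append a background column, then multiply the class scores by centerness and, when the extra branches are enabled, by the depth-confidence terms. A compact illustration with random stand-in values:

import torch

num_boxes, num_classes = 50, 3
cls_scores = torch.rand(num_boxes, num_classes)        # per-class sigmoid scores
centerness = torch.rand(num_boxes)                     # centerness in (0, 1)
depth_cls_conf = torch.rand(num_boxes)                 # mean of the top-2 depth-bin probabilities
depth_uncertainty = torch.exp(-torch.rand(num_boxes))  # exp(-predicted log-sigma)

# The 3D multi-class NMS expects num_classes + 1 score columns: foreground labels
# occupy [0, num_classes - 1] and the appended column stands for background.
padding = cls_scores.new_zeros(num_boxes, 1)
scores = torch.cat([cls_scores, padding], dim=1)

# Fused score actually consumed by the NMS.
nms_scores = scores * centerness[:, None] * depth_cls_conf[:, None] * depth_uncertainty[:, None]
print(nms_scores.shape)  # (50, 4)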
+ attr_labels_list (list[Tensor]): Attribute labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + concat_lvl_labels (list[Tensor]): Labels of each level. \ + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. + """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + if attr_labels_list is None: + attr_labels_list = [ + gt_labels.new_full(gt_labels.shape, self.attr_background_label) + for gt_labels in gt_labels_list + ] + + # get labels and bbox_targets of each image + _, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \ + centerness_targets_list, attr_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_3d_list, + gt_labels_3d_list, + centers2d_list, + depths_list, + attr_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + labels_3d_list = [ + labels_3d.split(num_points, 0) for labels_3d in labels_3d_list + ] + bbox_targets_3d_list = [ + bbox_targets_3d.split(num_points, 0) + for bbox_targets_3d in bbox_targets_3d_list + ] + centerness_targets_list = [ + centerness_targets.split(num_points, 0) + for centerness_targets in centerness_targets_list + ] + attr_targets_list = [ + attr_targets.split(num_points, 0) + for attr_targets in attr_targets_list + ] + + # concat per level image + concat_lvl_labels_3d = [] + concat_lvl_bbox_targets_3d = [] + concat_lvl_centerness_targets = [] + concat_lvl_attr_targets = [] + for i in range(num_levels): + concat_lvl_labels_3d.append( + torch.cat([labels[i] for labels in labels_3d_list])) + concat_lvl_centerness_targets.append( + torch.cat([ + centerness_targets[i] + for centerness_targets in centerness_targets_list + ])) + bbox_targets_3d = torch.cat([ + bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list + ]) + if self.pred_bbox2d: + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets], + dim=1) + concat_lvl_attr_targets.append( + torch.cat( + [attr_targets[i] for attr_targets in attr_targets_list])) + if self.norm_on_bbox: + bbox_targets_3d[:, :2] = \ + bbox_targets_3d[:, :2] / self.strides[i] + if self.pred_bbox2d: + bbox_targets_3d[:, -4:] = \ + bbox_targets_3d[:, -4:] / self.strides[i] + concat_lvl_bbox_targets_3d.append(bbox_targets_3d) + return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ + concat_lvl_centerness_targets, concat_lvl_attr_targets diff --git a/mmdet3d/models/dense_heads/point_rpn_head.py b/mmdet3d/models/dense_heads/point_rpn_head.py new file mode 100644 index 0000000000..48ed1324e4 --- /dev/null +++ b/mmdet3d/models/dense_heads/point_rpn_head.py @@ -0,0 +1,377 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
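Going back to the get_targets routine that closes the PGD head above: each image first produces one flat target tensor over all levels, which is then split back into per-level chunks and re-concatenated across the batch per level. A toy version of that bookkeeping, with invented level sizes:

import torch

# Hypothetical setup: 2 images, 3 FPN levels with 100 / 25 / 9 points each.
num_points = [100, 25, 9]
total = sum(num_points)
labels_per_img = [torch.randint(0, 4, (total,)) for _ in range(2)]

# Step 1: split each image's flat target vector back into per-level chunks.
labels_per_img = [labels.split(num_points, 0) for labels in labels_per_img]

# Step 2: for every level, concatenate that level's chunk across the batch.
concat_lvl_labels = [
    torch.cat([img_labels[lvl] for img_labels in labels_per_img])
    for lvl in range(len(num_points))
]
print([t.shape for t in concat_lvl_labels])  # lengths 200, 50, 18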
+import torch +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + +from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, + LiDARInstance3DBoxes) +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from mmdet.core import build_bbox_coder, multi_apply +from mmdet.models import HEADS, build_loss + + +@HEADS.register_module() +class PointRPNHead(BaseModule): + """RPN module for PointRCNN. + + Args: + num_classes (int): Number of classes. + train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + pred_layer_cfg (dict, optional): Config of classfication and + regression prediction layers. Defaults to None. + enlarge_width (float, optional): Enlarge bbox for each side to ignore + close points. Defaults to 0.1. + cls_loss (dict, optional): Config of direction classification loss. + Defaults to None. + bbox_loss (dict, optional): Config of localization loss. + Defaults to None. + bbox_coder (dict, optional): Config dict of box coders. + Defaults to None. + init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__(self, + num_classes, + train_cfg, + test_cfg, + pred_layer_cfg=None, + enlarge_width=0.1, + cls_loss=None, + bbox_loss=None, + bbox_coder=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.enlarge_width = enlarge_width + + # build loss function + self.bbox_loss = build_loss(bbox_loss) + self.cls_loss = build_loss(cls_loss) + + # build box coder + self.bbox_coder = build_bbox_coder(bbox_coder) + + # build pred conv + self.cls_layers = self._make_fc_layers( + fc_cfg=pred_layer_cfg.cls_linear_channels, + input_channels=pred_layer_cfg.in_channels, + output_channels=self._get_cls_out_channels()) + + self.reg_layers = self._make_fc_layers( + fc_cfg=pred_layer_cfg.reg_linear_channels, + input_channels=pred_layer_cfg.in_channels, + output_channels=self._get_reg_out_channels()) + + def _make_fc_layers(self, fc_cfg, input_channels, output_channels): + """Make fully connect layers. + + Args: + fc_cfg (dict): Config of fully connect. + input_channels (int): Input channels for fc_layers. + output_channels (int): Input channels for fc_layers. + + Returns: + nn.Sequential: Fully connect layers. + """ + fc_layers = [] + c_in = input_channels + for k in range(0, fc_cfg.__len__()): + fc_layers.extend([ + nn.Linear(c_in, fc_cfg[k], bias=False), + nn.BatchNorm1d(fc_cfg[k]), + nn.ReLU(), + ]) + c_in = fc_cfg[k] + fc_layers.append(nn.Linear(c_in, output_channels, bias=True)) + return nn.Sequential(*fc_layers) + + def _get_cls_out_channels(self): + """Return the channel number of classification outputs.""" + # Class numbers (k) + objectness (1) + return self.num_classes + + def _get_reg_out_channels(self): + """Return the channel number of regression outputs.""" + # Bbox classification and regression + # (center residual (3), size regression (3) + # torch.cos(yaw) (1), torch.sin(yaw) (1) + return self.bbox_coder.code_size + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + tuple[list[torch.Tensor]]: Predicted boxes and classification + scores. 
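The _make_fc_layers helper above stacks Linear + BatchNorm1d + ReLU blocks and terminates with a bias-enabled Linear. A self-contained equivalent, with example channel sizes that are not taken from any real config:

import torch
from torch import nn

def make_fc_layers(channels, in_channels, out_channels):
    """Stack Linear(+BN1d+ReLU) blocks and end with a bias-enabled Linear."""
    layers = []
    c_in = in_channels
    for c_out in channels:
        layers += [nn.Linear(c_in, c_out, bias=False), nn.BatchNorm1d(c_out), nn.ReLU()]
        c_in = c_out
    layers.append(nn.Linear(c_in, out_channels, bias=True))
    return nn.Sequential(*layers)

# e.g. a classification branch mapping 128-d point features to 3 class logits
cls_branch = make_fc_layers((256, 256), in_channels=128, out_channels=3)
print(cls_branch(torch.randn(4096, 128)).shape)  # (4096, 3)

In the head itself the same builder produces both the classification and the regression branch; only the output width differs.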
+ """ + point_features = feat_dict['fp_features'] + point_features = point_features.permute(0, 2, 1).contiguous() + batch_size = point_features.shape[0] + feat_cls = point_features.view(-1, point_features.shape[-1]) + feat_reg = point_features.view(-1, point_features.shape[-1]) + + point_cls_preds = self.cls_layers(feat_cls).reshape( + batch_size, -1, self._get_cls_out_channels()) + point_box_preds = self.reg_layers(feat_reg).reshape( + batch_size, -1, self._get_reg_out_channels()) + return (point_box_preds, point_cls_preds) + + @force_fp32(apply_to=('bbox_preds')) + def loss(self, + bbox_preds, + cls_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + img_metas=None): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head. + cls_preds (dict): Classification from forward of PointRCNN + RPN_Head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + img_metas (list[dict], Optional): Contain pcd and img's meta info. + Defaults to None. + + Returns: + dict: Losses of PointRCNN RPN module. + """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d) + (bbox_targets, mask_targets, positive_mask, negative_mask, + box_loss_weights, point_targets) = targets + + # bbox loss + bbox_loss = self.bbox_loss(bbox_preds, bbox_targets, + box_loss_weights.unsqueeze(-1)) + # calculate semantic loss + semantic_points = cls_preds.reshape(-1, self.num_classes) + semantic_targets = mask_targets + semantic_targets[negative_mask] = self.num_classes + semantic_points_label = semantic_targets + # for ignore, but now we do not have ignore label + semantic_loss_weight = negative_mask.float() + positive_mask.float() + semantic_loss = self.cls_loss(semantic_points, + semantic_points_label.reshape(-1), + semantic_loss_weight.reshape(-1)) + semantic_loss /= positive_mask.float().sum() + losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss) + + return losses + + def get_targets(self, points, gt_bboxes_3d, gt_labels_3d): + """Generate targets of PointRCNN RPN head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + + Returns: + tuple[torch.Tensor]: Targets of PointRCNN RPN head. + """ + # find empty example + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + + (bbox_targets, mask_targets, positive_mask, negative_mask, + point_targets) = multi_apply(self.get_targets_single, points, + gt_bboxes_3d, gt_labels_3d) + + bbox_targets = torch.stack(bbox_targets) + mask_targets = torch.stack(mask_targets) + positive_mask = torch.stack(positive_mask) + negative_mask = torch.stack(negative_mask) + box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) + + return (bbox_targets, mask_targets, positive_mask, negative_mask, + box_loss_weights, point_targets) + + def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d): + """Generate targets of PointRCNN RPN head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. 
+ gt_labels_3d (torch.Tensor): Labels of each batch. + + Returns: + tuple[torch.Tensor]: Targets of ssd3d head. + """ + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + + valid_gt = gt_labels_3d != -1 + gt_bboxes_3d = gt_bboxes_3d[valid_gt] + gt_labels_3d = gt_labels_3d[valid_gt] + + # transform the bbox coordinate to the pointcloud coordinate + gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone() + gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2 + + points_mask, assignment = self._assign_targets_by_points_inside( + gt_bboxes_3d, points) + gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment] + mask_targets = gt_labels_3d[assignment] + + bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor, + points[..., 0:3], mask_targets) + + positive_mask = (points_mask.max(1)[0] > 0) + negative_mask = (points_mask.max(1)[0] == 0) + # add ignore_mask + extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width) + points_mask, _ = self._assign_targets_by_points_inside( + extend_gt_bboxes_3d, points) + negative_mask = (points_mask.max(1)[0] == 0) + + point_targets = points[..., 0:3] + return (bbox_targets, mask_targets, positive_mask, negative_mask, + point_targets) + + def get_bboxes(self, + points, + bbox_preds, + cls_preds, + input_metas, + rescale=False): + """Generate bboxes from RPN head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Regression predictions from PointRCNN head. + cls_preds (dict): Class scores predictions from PointRCNN head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool, optional): Whether to rescale bboxes. + Defaults to False. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + sem_scores = cls_preds.sigmoid() + obj_scores = sem_scores.max(-1)[0] + object_class = sem_scores.argmax(dim=-1) + + batch_size = sem_scores.shape[0] + results = list() + for b in range(batch_size): + bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3], + object_class[b]) + bbox_selected, score_selected, labels, cls_preds_selected = \ + self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d, + points[b, ..., :3], input_metas[b]) + bbox = input_metas[b]['box_type_3d']( + bbox_selected.clone(), + box_dim=bbox_selected.shape[-1], + with_yaw=True) + results.append((bbox, score_selected, labels, cls_preds_selected)) + return results + + def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Class agnostic nms. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): Semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
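The class_agnostic_nms documented above (its body follows) runs one NMS pass over all classes and only afterwards derives labels from the semantic scores. The control flow looks roughly like the sketch below, which uses torchvision's axis-aligned nms as a simplified stand-in for mmdet3d's rotated nms_gpu and invented thresholds, assuming torchvision is available:

import torch
from torchvision.ops import nms  # axis-aligned stand-in for the rotated BEV NMS

def class_agnostic_nms(boxes_xyxy, obj_scores, sem_scores,
                       score_thr=0.1, nms_pre=512, iou_thr=0.7, nms_post=100):
    # 1) drop low-objectness proposals
    keep = obj_scores >= score_thr
    boxes_xyxy, obj_scores, sem_scores = boxes_xyxy[keep], obj_scores[keep], sem_scores[keep]
    if obj_scores.numel() == 0:
        return boxes_xyxy, obj_scores, obj_scores.new_zeros(0, dtype=torch.long), sem_scores
    # 2) keep at most nms_pre highest-scoring proposals
    topk = min(nms_pre, obj_scores.shape[0])
    obj_scores, inds = torch.topk(obj_scores, k=topk)
    boxes_xyxy, sem_scores = boxes_xyxy[inds], sem_scores[inds]
    # 3) a single NMS over all classes, then cap the number of survivors
    keep = nms(boxes_xyxy, obj_scores, iou_thr)[:nms_post]
    boxes_xyxy, obj_scores, sem_scores = boxes_xyxy[keep], obj_scores[keep], sem_scores[keep]
    # 4) labels come from the semantic branch, not from per-class NMS
    labels = sem_scores.argmax(dim=-1)
    return boxes_xyxy, obj_scores, labels, sem_scores

xy = torch.rand(300, 2) * 50
boxes = torch.cat([xy, xy + torch.rand(300, 2) * 4 + 0.5], dim=1)  # random BEV rectangles
out = class_agnostic_nms(boxes, torch.rand(300), torch.rand(300, 3))
print([t.shape for t in out])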
+ """ + nms_cfg = self.test_cfg.nms_cfg if not self.training \ + else self.train_cfg.nms_cfg + if nms_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + + num_bbox = bbox.shape[0] + bbox = input_meta['box_type_3d']( + bbox.clone(), + box_dim=bbox.shape[-1], + with_yaw=True, + origin=(0.5, 0.5, 0.5)) + + if isinstance(bbox, LiDARInstance3DBoxes): + box_idx = bbox.points_in_boxes(points) + box_indices = box_idx.new_zeros([num_bbox + 1]) + box_idx[box_idx == -1] = num_bbox + box_indices.scatter_add_(0, box_idx.long(), + box_idx.new_ones(box_idx.shape)) + box_indices = box_indices[:-1] + nonempty_box_mask = box_indices >= 0 + elif isinstance(bbox, DepthInstance3DBoxes): + box_indices = bbox.points_in_boxes(points) + nonempty_box_mask = box_indices.T.sum(1) >= 0 + else: + raise NotImplementedError('Unsupported bbox type!') + + bbox = bbox.tensor[nonempty_box_mask] + + if self.test_cfg.score_thr is not None: + score_thr = self.test_cfg.score_thr + keep = (obj_scores >= score_thr) + obj_scores = obj_scores[keep] + sem_scores = sem_scores[keep] + bbox = bbox[keep] + + if obj_scores.shape[0] > 0: + topk = min(nms_cfg.nms_pre, obj_scores.shape[0]) + obj_scores_nms, indices = torch.topk(obj_scores, k=topk) + bbox_for_nms = bbox[indices] + sem_scores_nms = sem_scores[indices] + + keep = nms_func(bbox_for_nms[:, 0:7], obj_scores_nms, + nms_cfg.iou_thr) + keep = keep[:nms_cfg.nms_post] + + bbox_selected = bbox_for_nms[keep] + score_selected = obj_scores_nms[keep] + cls_preds = sem_scores_nms[keep] + labels = torch.argmax(cls_preds, -1) + + return bbox_selected, score_selected, labels, cls_preds + + def _assign_targets_by_points_inside(self, bboxes_3d, points): + """Compute assignment by checking whether point is inside bbox. + + Args: + bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes. + points (torch.Tensor): Points of a batch. + + Returns: + tuple[torch.Tensor]: Flags indicating whether each point is + inside bbox and the index of box where each point are in. + """ + # TODO: align points_in_boxes function in each box_structures + num_bbox = bboxes_3d.tensor.shape[0] + if isinstance(bboxes_3d, LiDARInstance3DBoxes): + assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long() + points_mask = assignment.new_zeros( + [assignment.shape[0], num_bbox + 1]) + assignment[assignment == -1] = num_bbox + points_mask.scatter_(1, assignment.unsqueeze(1), 1) + points_mask = points_mask[:, :-1] + assignment[assignment == num_bbox] = num_bbox - 1 + elif isinstance(bboxes_3d, DepthInstance3DBoxes): + points_mask = bboxes_3d.points_in_boxes(points) + assignment = points_mask.argmax(dim=-1) + else: + raise NotImplementedError('Unsupported bbox type!') + + return points_mask, assignment diff --git a/mmdet3d/models/dense_heads/shape_aware_head.py b/mmdet3d/models/dense_heads/shape_aware_head.py index 9b7e5eedb6..b9d6caae05 100644 --- a/mmdet3d/models/dense_heads/shape_aware_head.py +++ b/mmdet3d/models/dense_heads/shape_aware_head.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings + import numpy as np import torch -import warnings from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn @@ -30,15 +31,17 @@ class BaseShapeHead(BaseModule): num_base_anchors (int): Number of anchors per location. box_code_size (int): The dimension of boxes to be encoded. in_channels (int): Input channels for convolutional layers. - shared_conv_channels (tuple): Channels for shared convolutional \ - layers. Default: (64, 64). 
\ - shared_conv_strides (tuple): Strides for shared convolutional \ - layers. Default: (1, 1). - use_direction_classifier (bool, optional): Whether to use direction \ + shared_conv_channels (tuple, optional): Channels for shared + convolutional layers. Default: (64, 64). + shared_conv_strides (tuple, optional): Strides for shared + convolutional layers. Default: (1, 1). + use_direction_classifier (bool, optional): Whether to use direction classifier. Default: True. - conv_cfg (dict): Config of conv layer. Default: dict(type='Conv2d') - norm_cfg (dict): Config of norm layer. Default: dict(type='BN2d'). - bias (bool|str, optional): Type of bias. Default: False. + conv_cfg (dict, optional): Config of conv layer. + Default: dict(type='Conv2d') + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='BN2d'). + bias (bool | str, optional): Type of bias. Default: False. """ def __init__(self, @@ -127,11 +130,11 @@ def forward(self, x): [B, C, H, W]. Returns: - dict[torch.Tensor]: Contain score of each class, bbox \ - regression and direction classification predictions. \ - Note that all the returned tensors are reshaped as \ - [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. \ - It is more convenient to concat anchors for different \ + dict[torch.Tensor]: Contain score of each class, bbox + regression and direction classification predictions. + Note that all the returned tensors are reshaped as + [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. + It is more convenient to concat anchors for different classes even though they have different feature map sizes. """ x = self.shared_conv(x) @@ -168,9 +171,9 @@ class ShapeAwareHead(Anchor3DHead): Args: tasks (dict): Shape-aware groups of multi-class objects. - assign_per_class (bool, optional): Whether to do assignment for each \ + assign_per_class (bool, optional): Whether to do assignment for each class. Default: True. - kwargs (dict): Other arguments are the same as those in \ + kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`. """ @@ -217,7 +220,7 @@ def forward_single(self, x): Args: x (torch.Tensor): Input features. Returns: - tuple[torch.Tensor]: Contain score of each class, bbox \ + tuple[torch.Tensor]: Contain score of each class, bbox regression and direction classification predictions. """ results = [] @@ -263,7 +266,7 @@ def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, num_total_samples (int): The number of valid samples. Returns: - tuple[torch.Tensor]: Losses of class, bbox \ + tuple[torch.Tensor]: Losses of class, bbox and direction, respectively. """ # classification loss @@ -325,16 +328,16 @@ class predictions. of each sample. gt_labels (list[torch.Tensor]): Gt labels of each sample. input_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and \ + dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_cls (list[torch.Tensor]): Classification losses. - loss_bbox (list[torch.Tensor]): Box regression losses. - - loss_dir (list[torch.Tensor]): Direction classification \ + - loss_dir (list[torch.Tensor]): Direction classification losses. """ device = cls_scores[0].device @@ -388,7 +391,7 @@ def get_bboxes(self, dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. 
input_metas (list[dict]): Contain pcd and img's meta info. - cfg (None | :obj:`ConfigDict`): Training or testing config. + cfg (:obj:`ConfigDict`, optional): Training or testing config. Default: None. rescale (list[torch.Tensor], optional): Whether to rescale bbox. Default: False. @@ -443,8 +446,8 @@ def get_bboxes_single(self, mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. - cfg (None | :obj:`ConfigDict`): Training or testing config. - rescale (list[torch.Tensor], optional): whether to rescale bbox. \ + cfg (:obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor], optional): whether to rescale bbox. Default: False. Returns: diff --git a/mmdet3d/models/dense_heads/smoke_mono3d_head.py b/mmdet3d/models/dense_heads/smoke_mono3d_head.py new file mode 100644 index 0000000000..0edf0ad022 --- /dev/null +++ b/mmdet3d/models/dense_heads/smoke_mono3d_head.py @@ -0,0 +1,516 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + +from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder +from mmdet.models.builder import HEADS +from mmdet.models.utils import gaussian_radius, gen_gaussian_target +from mmdet.models.utils.gaussian_target import (get_local_maximum, + get_topk_from_heatmap, + transpose_and_gather_feat) +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + + +@HEADS.register_module() +class SMOKEMono3DHead(AnchorFreeMono3DHead): + r"""Anchor-free head used in `SMOKE `_ + + .. code-block:: none + + /-----> 3*3 conv -----> 1*1 conv -----> cls + feature + \-----> 3*3 conv -----> 1*1 conv -----> reg + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + dim_channel (list[int]): indices of dimension offset preds in + regression heatmap channels. + ori_channel (list[int]): indices of orientation offset pred in + regression heatmap channels. + bbox_coder (:obj:`CameraInstance3DBoxes`): Bbox coder + for encoding and decoding boxes. + loss_cls (dict, optional): Config of classification loss. + Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). + loss_bbox (dict, optional): Config of localization loss. + Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0). + loss_dir (dict, optional): Config of direction classification loss. + In SMOKE, Default: None. + loss_attr (dict, optional): Config of attribute classification loss. + In SMOKE, Default: None. + loss_centerness (dict): Config of centerness loss. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + init_cfg (dict): Initialization config dict. Default: None. 
+ """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + dim_channel, + ori_channel, + bbox_coder, + loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.1), + loss_dir=None, + loss_attr=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + init_cfg=None, + **kwargs): + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_dir=loss_dir, + loss_attr=loss_attr, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.dim_channel = dim_channel + self.ori_channel = ori_channel + self.bbox_coder = build_bbox_coder(bbox_coder) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + """ + return multi_apply(self.forward_single, feats) + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): Input feature map. + + Returns: + tuple: Scores for each class, bbox of input feature maps. + """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ + super().forward_single(x) + cls_score = cls_score.sigmoid() # turn to 0-1 + cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) + # (N, C, H, W) + offset_dims = bbox_pred[:, self.dim_channel, ...] + bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5 + # (N, C, H, W) + vector_ori = bbox_pred[:, self.ori_channel, ...] + bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori) + return cls_score, bbox_pred + + def get_bboxes(self, cls_scores, bbox_preds, img_metas, rescale=None): + """Generate bboxes from bbox head predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + bbox_preds (list[Tensor]): Box regression for each scale. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + + Returns: + list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: + Each item in result_list is 4-tuple. 
+ """ + assert len(cls_scores) == len(bbox_preds) == 1 + cam2imgs = torch.stack([ + cls_scores[0].new_tensor(img_meta['cam2img']) + for img_meta in img_metas + ]) + trans_mats = torch.stack([ + cls_scores[0].new_tensor(img_meta['trans_mat']) + for img_meta in img_metas + ]) + batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( + cls_scores[0], + bbox_preds[0], + img_metas, + cam2imgs=cam2imgs, + trans_mats=trans_mats, + topk=100, + kernel=3) + + result_list = [] + for img_id in range(len(img_metas)): + + bboxes = batch_bboxes[img_id] + scores = batch_scores[img_id] + labels = batch_topk_labels[img_id] + + keep_idx = scores > 0.25 + bboxes = bboxes[keep_idx] + scores = scores[keep_idx] + labels = labels[keep_idx] + + bboxes = img_metas[img_id]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) + attrs = None + result_list.append((bboxes, scores, labels, attrs)) + + return result_list + + def decode_heatmap(self, + cls_score, + reg_pred, + img_metas, + cam2imgs, + trans_mats, + topk=100, + kernel=3): + """Transform outputs into detections raw bbox predictions. + + Args: + class_score (Tensor): Center predict heatmap, + shape (B, num_classes, H, W). + reg_pred (Tensor): Box regression map. + shape (B, channel, H , W). + img_metas (List[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cam2imgs (Tensor): Camera intrinsic matrixs. + shape (B, 4, 4) + trans_mats (Tensor): Transformation matrix from original image + to feature map. + shape: (batch, 3, 3) + topk (int): Get top k center keypoints from heatmap. Default 100. + kernel (int): Max pooling kernel for extract local maximum pixels. + Default 3. + + Returns: + tuple[torch.Tensor]: Decoded output of SMOKEHead, containing + the following Tensors: + - batch_bboxes (Tensor): Coords of each 3D box. + shape (B, k, 7) + - batch_scores (Tensor): Scores of each 3D box. + shape (B, k) + - batch_topk_labels (Tensor): Categories of each 3D box. + shape (B, k) + """ + img_h, img_w = img_metas[0]['pad_shape'][:2] + bs, _, feat_h, feat_w = cls_score.shape + + center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) + + *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( + center_heatmap_pred, k=topk) + batch_scores, batch_index, batch_topk_labels = batch_dets + + regression = transpose_and_gather_feat(reg_pred, batch_index) + regression = regression.view(-1, 8) + + points = torch.cat([topk_xs.view(-1, 1), + topk_ys.view(-1, 1).float()], + dim=1) + locations, dimensions, orientations = self.bbox_coder.decode( + regression, points, batch_topk_labels, cam2imgs, trans_mats) + + batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1) + batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size) + return batch_bboxes, batch_scores, batch_topk_labels + + def get_predictions(self, labels3d, centers2d, gt_locations, gt_dimensions, + gt_orientations, indices, img_metas, pred_reg): + """Prepare predictions for computing loss. + + Args: + labels3d (Tensor): Labels of each 3D box. + shape (B, max_objs, ) + centers2d (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2) + gt_locations (Tensor): Coords of each 3D box's location. + shape (B * max_objs, 3) + gt_dimensions (Tensor): Dimensions of each 3D box. + shape (N, 3) + gt_orientations (Tensor): Orientation(yaw) of each 3D box. + shape (N, 1) + indices (Tensor): Indices of the existence of the 3D box. 
+ shape (B * max_objs, ) + img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + pre_reg (Tensor): Box regression map. + shape (B, channel, H , W). + + Returns: + dict: the dict has components below: + - bbox3d_yaws (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred orientations. + - bbox3d_dims (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred dimensions. + - bbox3d_locs (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred locations. + """ + batch, channel = pred_reg.shape[0], pred_reg.shape[1] + w = pred_reg.shape[3] + cam2imgs = torch.stack([ + gt_locations.new_tensor(img_meta['cam2img']) + for img_meta in img_metas + ]) + trans_mats = torch.stack([ + gt_locations.new_tensor(img_meta['trans_mat']) + for img_meta in img_metas + ]) + centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] + centers2d_inds = centers2d_inds.view(batch, -1) + pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) + pred_regression_pois = pred_regression.view(-1, channel) + locations, dimensions, orientations = self.bbox_coder.decode( + pred_regression_pois, centers2d, labels3d, cam2imgs, trans_mats, + gt_locations) + + locations, dimensions, orientations = locations[indices], dimensions[ + indices], orientations[indices] + + locations[:, 1] += dimensions[:, 1] / 2 + + gt_locations = gt_locations[indices] + + assert len(locations) == len(gt_locations) + assert len(dimensions) == len(gt_dimensions) + assert len(orientations) == len(gt_orientations) + bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions, + orientations, img_metas) + bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions, + gt_orientations, img_metas) + bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions, + gt_orientations, img_metas) + + pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs) + + return pred_bboxes + + def get_targets(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, + centers2d, feat_shape, img_shape, img_metas): + """Get training targets for batch images. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gt,). + gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D Ground + truth bboxes of each image, + shape (num_gt, bbox_code_size). + gt_labels_3d (list[Tensor]): 3D Ground truth labels of each + box, shape (num_gt,). + centers2d (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + feat_shape (tuple[int]): Feature map shape with value, + shape (B, _, H, W). + img_shape (tuple[int]): Image shape in [h, w] format. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor, dict]: The Tensor value is the targets of + center heatmap, the dict has components below: + - gt_centers2d (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2) + - gt_labels3d (Tensor): Labels of each 3D box. + shape (B, max_objs, ) + - indices (Tensor): Indices of the existence of the 3D box. + shape (B * max_objs, ) + - affine_indices (Tensor): Indices of the affine of the 3D box. + shape (N, ) + - gt_locs (Tensor): Coords of each 3D box's location. + shape (N, 3) + - gt_dims (Tensor): Dimensions of each 3D box. + shape (N, 3) + - gt_yaws (Tensor): Orientation(yaw) of each 3D box. + shape (N, 1) + - gt_cors (Tensor): Coords of the corners of each 3D box. 
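get_predictions above gathers the dense regression map at the projected GT centers by flattening (x, y) into a single H*W index and calling transpose_and_gather_feat. The same gathering written out by hand, with a hypothetical map size and centers:

import torch

# Hypothetical dense regression map and projected GT centers for one image.
batch, channel, h, w = 1, 8, 96, 320
pred_reg = torch.randn(batch, channel, h, w)
centers2d = torch.tensor([[40, 12], [200, 60], [310, 90]])  # (x, y) in feature-map pixels

# Flatten (y, x) into one index over the H*W grid, as in get_predictions.
center_inds = centers2d[:, 1] * w + centers2d[:, 0]          # shape (num_objs,)

# Equivalent of transpose_and_gather_feat: (B, C, H, W) -> (B, H*W, C), then pick rows.
feat = pred_reg.permute(0, 2, 3, 1).reshape(batch, h * w, channel)
pred_at_centers = feat[0, center_inds]                       # (num_objs, channel)
print(pred_at_centers.shape)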
+ shape (N, 8, 3) + """ + + reg_mask = torch.stack([ + gt_bboxes[0].new_tensor( + not img_meta['affine_aug'], dtype=torch.bool) + for img_meta in img_metas + ]) + + img_h, img_w = img_shape[:2] + bs, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) # 1/4 + height_ratio = float(feat_h / img_h) # 1/4 + + assert width_ratio == height_ratio + + center_heatmap_target = gt_bboxes[-1].new_zeros( + [bs, self.num_classes, feat_h, feat_w]) + + gt_centers2d = centers2d.copy() + + for batch_id in range(bs): + gt_bbox = gt_bboxes[batch_id] + gt_label = gt_labels[batch_id] + # project centers2d from input image to feat map + gt_center2d = gt_centers2d[batch_id] * width_ratio + + for j, center in enumerate(gt_center2d): + center_x_int, center_y_int = center.int() + scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio + scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.7) + radius = max(0, int(radius)) + ind = gt_label[j] + gen_gaussian_target(center_heatmap_target[batch_id, ind], + [center_x_int, center_y_int], radius) + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + num_ctrs = [center2d.shape[0] for center2d in centers2d] + max_objs = max(num_ctrs) + + reg_inds = torch.cat( + [reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)]) + + inds = torch.zeros((bs, max_objs), + dtype=torch.bool).to(centers2d[0].device) + + # put gt 3d bboxes to gpu + gt_bboxes_3d = [ + gt_bbox_3d.to(centers2d[0].device) for gt_bbox_3d in gt_bboxes_3d + ] + + batch_centers2d = centers2d[0].new_zeros((bs, max_objs, 2)) + batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs)) + batch_gt_locations = \ + gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3)) + for i in range(bs): + inds[i, :num_ctrs[i]] = 1 + batch_centers2d[i, :num_ctrs[i]] = centers2d[i] + batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i] + batch_gt_locations[i, :num_ctrs[i]] = \ + gt_bboxes_3d[i].tensor[:, :3] + + inds = inds.flatten() + batch_centers2d = batch_centers2d.view(-1, 2) * width_ratio + batch_gt_locations = batch_gt_locations.view(-1, 3) + + # filter the empty image, without gt_bboxes_3d + gt_bboxes_3d = [ + gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d + if gt_bbox_3d.tensor.shape[0] > 0 + ] + + gt_dimensions = torch.cat( + [gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d]) + gt_orientations = torch.cat([ + gt_bbox_3d.tensor[:, 6].unsqueeze(-1) + for gt_bbox_3d in gt_bboxes_3d + ]) + gt_corners = torch.cat( + [gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d]) + + target_labels = dict( + gt_centers2d=batch_centers2d.long(), + gt_labels3d=batch_labels_3d, + indices=inds, + reg_indices=reg_inds, + gt_locs=batch_gt_locations, + gt_dims=gt_dimensions, + gt_yaws=gt_orientations, + gt_cors=gt_corners) + + return center_heatmap_target, avg_factor, target_labels + + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + shape (num_gt, 4). + bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel + number is bbox_code_size. + shape (B, 7, H, W). + gt_bboxes (list[Tensor]): Ground truth bboxes for each image. + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box. + shape (num_gts, ). 
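The heatmap targets built above place a Gaussian peak at every projected 3D center, with the radius derived from the scaled 2D box via gaussian_radius(min_overlap=0.7) and the splat done by gen_gaussian_target. A simplified stand-in that uses a fixed radius and an isotropic Gaussian, just to show the shape of the target:

import torch

def draw_gaussian(heatmap, center_xy, radius):
    """Splat an isotropic Gaussian peak onto a single-class heatmap (element-wise max)."""
    diameter = 2 * radius + 1
    sigma = diameter / 6
    coords = torch.arange(-radius, radius + 1, dtype=torch.float32)
    gaussian = torch.exp(-(coords[None] ** 2 + coords[:, None] ** 2) / (2 * sigma ** 2))
    x, y = center_xy
    h, w = heatmap.shape
    left, right = min(x, radius), min(w - x, radius + 1)
    top, bottom = min(y, radius), min(h - y, radius + 1)
    patch = heatmap[y - top:y + bottom, x - left:x + right]
    heatmap[y - top:y + bottom, x - left:x + right] = torch.maximum(
        patch, gaussian[radius - top:radius + bottom, radius - left:radius + right])
    return heatmap

target = torch.zeros(96, 320)
draw_gaussian(target, center_xy=(160, 48), radius=6)  # the real radius depends on the scaled box size
print(target.max().item(), (target > 0.5).sum().item())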
+ gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground + truth. it is the flipped gt_bboxes + gt_labels_3d (list[Tensor]): Same as gt_labels. + centers2d (list[Tensor]): 2D centers on the image. + shape (num_gts, 2). + depths (list[Tensor]): Depth ground truth. + shape (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + In kitti it's None. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + Default: None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == 1 + assert attr_labels is None + assert gt_bboxes_ignore is None + center2d_heatmap = cls_scores[0] + pred_reg = bbox_preds[0] + + center2d_heatmap_target, avg_factor, target_labels = \ + self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, + center2d_heatmap.shape, + img_metas[0]['pad_shape'], + img_metas) + + pred_bboxes = self.get_predictions( + labels3d=target_labels['gt_labels3d'], + centers2d=target_labels['gt_centers2d'], + gt_locations=target_labels['gt_locs'], + gt_dimensions=target_labels['gt_dims'], + gt_orientations=target_labels['gt_yaws'], + indices=target_labels['indices'], + img_metas=img_metas, + pred_reg=pred_reg) + + loss_cls = self.loss_cls( + center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) + + reg_inds = target_labels['reg_indices'] + + loss_bbox_oris = self.loss_bbox( + pred_bboxes['ori'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox_dims = self.loss_bbox( + pred_bboxes['dim'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox_locs = self.loss_bbox( + pred_bboxes['loc'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris + + loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + return loss_dict diff --git a/mmdet3d/models/dense_heads/ssd_3d_head.py b/mmdet3d/models/dense_heads/ssd_3d_head.py index 9e936fb144..85c60a7e1d 100644 --- a/mmdet3d/models/dense_heads/ssd_3d_head.py +++ b/mmdet3d/models/dense_heads/ssd_3d_head.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np import torch from mmcv.ops.nms import batched_nms from mmcv.runner import force_fp32 @@ -128,15 +127,15 @@ def loss(self, Args: bbox_preds (dict): Predictions from forward of SSD3DHead. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: @@ -231,12 +230,12 @@ def get_targets(self, Args: points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. 
gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): Point-wise instance + pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head. @@ -320,12 +319,12 @@ def get_targets_single(self, Args: points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (None | torch.Tensor): Point-wise semantic + pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. - pts_instance_mask (None | torch.Tensor): Point-wise instance + pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. aggregated_points (torch.Tensor): Aggregated points from candidate points layer. @@ -392,7 +391,8 @@ def get_targets_single(self, # LiDARInstance3DBoxes and DepthInstance3DBoxes canonical_xyz = rotation_3d_in_axis( canonical_xyz.unsqueeze(0).transpose(0, 1), - -gt_bboxes_3d.yaw[assignment], 2).squeeze(1) + -gt_bboxes_3d.yaw[assignment], + axis=2).squeeze(1) distance_front = torch.clamp( size_res_targets[:, 0] - canonical_xyz[:, 0], min=0) distance_back = torch.clamp( @@ -441,7 +441,7 @@ def get_targets_single(self, negative_mask) def get_bboxes(self, points, bbox_preds, input_metas, rescale=False): - """Generate bboxes from sdd3d head predictions. + """Generate bboxes from 3DSSD head predictions. Args: points (torch.Tensor): Input points. @@ -464,9 +464,7 @@ def get_bboxes(self, points, bbox_preds, input_metas, rescale=False): bbox_selected, score_selected, labels = self.multiclass_nms_single( obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) - # fix the wrong direction - # To do: remove this ops - bbox_selected[..., 6] += np.pi + bbox = input_metas[b]['box_type_3d']( bbox_selected.clone(), box_dim=bbox_selected.shape[-1], @@ -481,7 +479,7 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): semantic class score of bounding boxes. + sem_scores (torch.Tensor): Semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. @@ -489,23 +487,14 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. 
""" - num_bbox = bbox.shape[0] bbox = input_meta['box_type_3d']( bbox.clone(), box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) - if isinstance(bbox, LiDARInstance3DBoxes): - box_idx = bbox.points_in_boxes(points) - box_indices = box_idx.new_zeros([num_bbox + 1]) - box_idx[box_idx == -1] = num_bbox - box_indices.scatter_add_(0, box_idx.long(), - box_idx.new_ones(box_idx.shape)) - box_indices = box_indices[:-1] - nonempty_box_mask = box_indices >= 0 - elif isinstance(bbox, DepthInstance3DBoxes): - box_indices = bbox.points_in_boxes(points) + if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + box_indices = bbox.points_in_boxes_all(points) nonempty_box_mask = box_indices.T.sum(1) >= 0 else: raise NotImplementedError('Unsupported bbox type!') @@ -516,20 +505,20 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] bbox_classes = torch.argmax(sem_scores, -1) - nms_selected = batched_nms( + nms_keep = batched_nms( minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_cfg)[1] - if nms_selected.shape[0] > self.test_cfg.max_output_num: - nms_selected = nms_selected[:self.test_cfg.max_output_num] + if nms_keep.shape[0] > self.test_cfg.max_output_num: + nms_keep = nms_keep[:self.test_cfg.max_output_num] # filter empty boxes and boxes with low score scores_mask = (obj_scores >= self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( - 0, nonempty_box_inds[nms_selected], 1) + 0, nonempty_box_inds[nms_keep], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: @@ -560,18 +549,8 @@ def _assign_targets_by_points_inside(self, bboxes_3d, points): tuple[torch.Tensor]: Flags indicating whether each point is inside bbox and the index of box where each point are in. """ - # TODO: align points_in_boxes function in each box_structures - num_bbox = bboxes_3d.tensor.shape[0] - if isinstance(bboxes_3d, LiDARInstance3DBoxes): - assignment = bboxes_3d.points_in_boxes(points).long() - points_mask = assignment.new_zeros( - [assignment.shape[0], num_bbox + 1]) - assignment[assignment == -1] = num_bbox - points_mask.scatter_(1, assignment.unsqueeze(1), 1) - points_mask = points_mask[:, :-1] - assignment[assignment == num_bbox] = num_bbox - 1 - elif isinstance(bboxes_3d, DepthInstance3DBoxes): - points_mask = bboxes_3d.points_in_boxes(points) + if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + points_mask = bboxes_3d.points_in_boxes_all(points) assignment = points_mask.argmax(dim=-1) else: raise NotImplementedError('Unsupported bbox type!') diff --git a/mmdet3d/models/dense_heads/train_mixins.py b/mmdet3d/models/dense_heads/train_mixins.py index c0bcf12869..90c9cbbfdc 100644 --- a/mmdet3d/models/dense_heads/train_mixins.py +++ b/mmdet3d/models/dense_heads/train_mixins.py @@ -25,7 +25,7 @@ def anchor_target_3d(self, gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each image. input_metas (list[dict]): Meta info of each image. - gt_bboxes_ignore_list (None | list): Ignore list of gt bboxes. + gt_bboxes_ignore_list (list): Ignore list of gt bboxes. gt_labels_list (list[torch.Tensor]): Gt labels of batches. label_channels (int): The channel of labels. num_classes (int): The number of classes. 
@@ -35,7 +35,7 @@ def anchor_target_3d(self, tuple (list, list, list, list, list, list, int, int): Anchor targets, including labels, label weights, bbox targets, bbox weights, direction targets, - direction weights, number of postive anchors and + direction weights, number of positive anchors and number of negative anchors. """ num_imgs = len(input_metas) @@ -293,6 +293,7 @@ def anchor_target_single_assigner(self, sampling_result.pos_bboxes, pos_bbox_targets, self.dir_offset, + self.dir_limit_offset, one_hot=False) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 @@ -318,6 +319,7 @@ def anchor_target_single_assigner(self, def get_direction_target(anchors, reg_targets, dir_offset=0, + dir_limit_offset=0, num_bins=2, one_hot=True): """Encode direction to 0 ~ num_bins-1. @@ -333,7 +335,7 @@ def get_direction_target(anchors, torch.Tensor: Encoded direction targets. """ rot_gt = reg_targets[..., 6] + anchors[..., 6] - offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi) dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) if one_hot: diff --git a/mmdet3d/models/dense_heads/vote_head.py b/mmdet3d/models/dense_heads/vote_head.py index 32e4fba1ab..691429513b 100644 --- a/mmdet3d/models/dense_heads/vote_head.py +++ b/mmdet3d/models/dense_heads/vote_head.py @@ -136,7 +136,7 @@ def forward(self, feat_dict, sample_mod): """Forward pass. Note: - The forward of VoteHead is devided into 4 steps: + The forward of VoteHead is divided into 4 steps: 1. Generate vote_points from seed_points. 2. Aggregate vote_points. @@ -234,15 +234,15 @@ def loss(self, Args: bbox_preds (dict): Predictions from forward of vote head. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. ret_target (Bool): Return targets or not. @@ -358,12 +358,12 @@ def get_targets(self, Args: points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): Point-wise instance + pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. @@ -447,12 +447,12 @@ def get_targets_single(self, Args: points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. 
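The new dir_limit_offset argument above only changes where limit_period wraps the angle before it is binned. A self-contained sketch of the direction-target encoding, using the same wrapping formula as mmdet3d's limit_period; dir_offset and the yaw values below are arbitrary examples:

import numpy as np
import torch

def limit_period(val, offset=0.5, period=np.pi):
    """Wrap angles into [-offset * period, (1 - offset) * period)."""
    return val - torch.floor(val / period + offset) * period

def get_direction_target(anchor_yaw, reg_target_yaw, dir_offset=np.pi / 4,
                         dir_limit_offset=0, num_bins=2):
    rot_gt = reg_target_yaw + anchor_yaw                      # recover the absolute gt yaw
    offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi)
    dir_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
    return torch.clamp(dir_targets, min=0, max=num_bins - 1)

anchor_yaw = torch.zeros(6)
gt_yaw = torch.tensor([0.0, 1.0, 2.0, 3.5, -1.0, -3.0])
print(get_direction_target(anchor_yaw, gt_yaw))  # direction bin (0 or 1) per anchor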
- pts_semantic_mask (None | torch.Tensor): Point-wise semantic + pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. - pts_instance_mask (None | torch.Tensor): Point-wise instance + pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. aggregated_points (torch.Tensor): Aggregated points from vote aggregation layer. @@ -471,7 +471,7 @@ def get_targets_single(self, vote_target_masks = points.new_zeros([num_points], dtype=torch.long) vote_target_idx = points.new_zeros([num_points], dtype=torch.long) - box_indices_all = gt_bboxes_3d.points_in_boxes(points) + box_indices_all = gt_bboxes_3d.points_in_boxes_all(points) for i in range(gt_labels_3d.shape[0]): box_indices = box_indices_all[:, i] indices = torch.nonzero( @@ -621,7 +621,7 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes(points) + box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) diff --git a/mmdet3d/models/detectors/__init__.py b/mmdet3d/models/detectors/__init__.py index a07454b6cc..894d7f33cd 100644 --- a/mmdet3d/models/detectors/__init__.py +++ b/mmdet3d/models/detectors/__init__.py @@ -10,7 +10,9 @@ from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN from .mvx_two_stage import MVXTwoStageDetector from .parta2 import PartA2 +from .point_rcnn import PointRCNN from .single_stage_mono3d import SingleStageMono3DDetector +from .smoke_mono3d import SMOKEMono3D from .ssd3dnet import SSD3DNet from .votenet import VoteNet from .voxelnet import VoxelNet @@ -19,5 +21,5 @@ 'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector', 'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet', 'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector', - 'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet' + 'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D' ] diff --git a/mmdet3d/models/detectors/base.py b/mmdet3d/models/detectors/base.py index 991951cd9c..4985c1dc67 100644 --- a/mmdet3d/models/detectors/base.py +++ b/mmdet3d/models/detectors/base.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + import mmcv import torch from mmcv.parallel import DataContainer as DC from mmcv.runner import auto_fp16 -from os import path as osp from mmdet3d.core import Box3DMode, Coord3DMode, show_result from mmdet.models.detectors import BaseDetector @@ -114,7 +115,7 @@ def show_results(self, data, result, out_dir, show=False, score_thr=None): Box3DMode.DEPTH) elif box_mode_3d != Box3DMode.DEPTH: ValueError( - f'Unsupported box_mode_3d {box_mode_3d} for convertion!') + f'Unsupported box_mode_3d {box_mode_3d} for conversion!') pred_bboxes = pred_bboxes.tensor.cpu().numpy() show_result( points, diff --git a/mmdet3d/models/detectors/centerpoint.py b/mmdet3d/models/detectors/centerpoint.py index 640bb8b338..ef34810d19 100644 --- a/mmdet3d/models/detectors/centerpoint.py +++ b/mmdet3d/models/detectors/centerpoint.py @@ -97,7 +97,8 @@ def aug_test_pts(self, feats, img_metas, rescale=False): Args: feats (list[torch.Tensor]): Feature of point cloud. img_metas (list[dict]): Meta information of samples. - rescale (bool): Whether to rescale bboxes. Default: False. + rescale (bool, optional): Whether to rescale bboxes. + Default: False. 
Returns: dict: Returned bboxes consists of the following keys: @@ -121,8 +122,8 @@ def aug_test_pts(self, feats, img_metas, rescale=False): task_id][0][key][:, 1, ...] elif key == 'rot': outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] + key][:, 0, + ...] = -outs[task_id][0][key][:, 0, ...] elif key == 'vel': outs[task_id][0][ key][:, 1, @@ -135,8 +136,8 @@ def aug_test_pts(self, feats, img_metas, rescale=False): task_id][0][key][:, 0, ...] elif key == 'rot': outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] + key][:, 1, + ...] = -outs[task_id][0][key][:, 1, ...] elif key == 'vel': outs[task_id][0][ key][:, 0, diff --git a/mmdet3d/models/detectors/groupfree3dnet.py b/mmdet3d/models/detectors/groupfree3dnet.py index 1260e868b0..52b3fe8184 100644 --- a/mmdet3d/models/detectors/groupfree3dnet.py +++ b/mmdet3d/models/detectors/groupfree3dnet.py @@ -38,11 +38,11 @@ def forward_train(self, img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic + pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): point-wise instance + pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: diff --git a/mmdet3d/models/detectors/h3dnet.py b/mmdet3d/models/detectors/h3dnet.py index d7bf8e29a7..bbae09a8df 100644 --- a/mmdet3d/models/detectors/h3dnet.py +++ b/mmdet3d/models/detectors/h3dnet.py @@ -47,11 +47,11 @@ def forward_train(self, img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic + pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): point-wise instance + pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: diff --git a/mmdet3d/models/detectors/imvotenet.py b/mmdet3d/models/detectors/imvotenet.py index 02ab2cd1c3..02297dccee 100644 --- a/mmdet3d/models/detectors/imvotenet.py +++ b/mmdet3d/models/detectors/imvotenet.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
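[Editor's note, not part of the patch] Sanity check for the rot-channel swap in `aug_test_pts` above, assuming the 'rot' branch stores (sin(yaw), cos(yaw)) in channels 0 and 1: reflecting a box across one axis maps yaw to -yaw (sine changes sign), while reflecting across the other maps yaw to pi - yaw (cosine changes sign).

    import numpy as np

    yaw = 0.6
    assert np.isclose(np.sin(-yaw), -np.sin(yaw))          # one flip: negate the sine channel
    assert np.isclose(np.cos(-yaw), np.cos(yaw))
    assert np.isclose(np.sin(np.pi - yaw), np.sin(yaw))    # the other flip: negate the cosine channel
    assert np.isclose(np.cos(np.pi - yaw), -np.cos(yaw))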
+import warnings + import numpy as np import torch -import warnings from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.utils import MLP @@ -149,21 +150,21 @@ def __init__(self, if self.with_img_backbone: if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') self.img_backbone.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_img_roi_head: if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') self.img_roi_head.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_pts_backbone: if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') self.pts_backbone.init_cfg = dict( type='Pretrained', checkpoint=pts_pretrained) @@ -393,9 +394,9 @@ def forward_train(self, with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[torch.Tensor]): class indices for each 2d bounding box. - gt_bboxes_ignore (None | list[torch.Tensor]): specify which + gt_bboxes_ignore (list[torch.Tensor]): specify which 2d bounding boxes can be ignored when computing the loss. - gt_masks (None | torch.Tensor): true segmentation masks for each + gt_masks (torch.Tensor): true segmentation masks for each 2d bbox, used if the architecture supports a segmentation task. proposals: override rpn proposals (2d) with custom proposals. Use when `with_rpn` is False. @@ -403,9 +404,9 @@ def forward_train(self, not supported yet. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes. gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes. - pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic + pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): point-wise instance + pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. Returns: diff --git a/mmdet3d/models/detectors/mvx_two_stage.py b/mmdet3d/models/detectors/mvx_two_stage.py index 2534604437..eb2f838bd4 100644 --- a/mmdet3d/models/detectors/mvx_two_stage.py +++ b/mmdet3d/models/detectors/mvx_two_stage.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
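[Editor's note, not part of the patch] Why the warning strings above were rewritten: a backslash continuation inside a string literal keeps the following indentation in the message, while two adjacent literals are concatenated cleanly.

    old = 'DeprecationWarning: pretrained is a deprecated \
                key, please consider using init_cfg'        # embeds the leading spaces
    new = ('DeprecationWarning: pretrained is a deprecated '
           'key, please consider using init_cfg.')           # single clean line
    assert '  ' in old and '  ' not in new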
+import warnings +from os import path as osp + import mmcv import torch -import warnings from mmcv.parallel import DataContainer as DC from mmcv.runner import force_fp32 -from os import path as osp from torch.nn import functional as F from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, @@ -84,21 +85,20 @@ def __init__(self, if self.with_img_backbone: if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') self.img_backbone.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_img_roi_head: if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') self.img_roi_head.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) - if self.with_pts_backbone: if pts_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated \ - key, please consider using init_cfg') + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg') self.pts_backbone.init_cfg = dict( type='Pretrained', checkpoint=pts_pretrained) @@ -260,7 +260,7 @@ def forward_train(self, of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. - img (torch.Tensor optional): Images of each sample with shape + img (torch.Tensor, optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. @@ -497,7 +497,7 @@ def show_results(self, data, result, out_dir): Box3DMode.DEPTH) elif box_mode_3d != Box3DMode.DEPTH: ValueError( - f'Unsupported box_mode_3d {box_mode_3d} for convertion!') + f'Unsupported box_mode_3d {box_mode_3d} for conversion!') pred_bboxes = pred_bboxes.tensor.cpu().numpy() show_result(points, None, pred_bboxes, out_dir, file_name) diff --git a/mmdet3d/models/detectors/point_rcnn.py b/mmdet3d/models/detectors/point_rcnn.py new file mode 100644 index 0000000000..bf972008da --- /dev/null +++ b/mmdet3d/models/detectors/point_rcnn.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.models import DETECTORS +from .two_stage import TwoStage3DDetector + + +@DETECTORS.register_module() +class PointRCNN(TwoStage3DDetector): + r"""PointRCNN detector. + + Please refer to the `PointRCNN `_ + + Args: + backbone (dict): Config dict of detector's backbone. + neck (dict, optional): Config dict of neck. Defaults to None. + rpn_head (dict, optional): Config of RPN head. Defaults to None. + roi_head (dict, optional): Config of ROI head. Defaults to None. + train_cfg (dict, optional): Train configs. Defaults to None. + test_cfg (dict, optional): Test configs. Defaults to None. + pretrained (str, optional): Model pretrained path. Defaults to None. + init_cfg (dict, optional): Config of initialization. Defaults to None. 
+ """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(PointRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + + def extract_feat(self, points): + """Directly extract features from the backbone+neck. + + Args: + points (torch.Tensor): Input points. + + Returns: + dict: Features from the backbone+neck + """ + x = self.backbone(points) + + if self.with_neck: + x = self.neck(x) + return x + + def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d): + """Forward of training. + + Args: + points (list[torch.Tensor]): Points of each batch. + img_metas (list[dict]): Meta information of each sample. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + + Returns: + dict: Losses. + """ + losses = dict() + points_cat = torch.stack(points) + x = self.extract_feat(points_cat) + + # features for rcnn + backbone_feats = x['fp_features'].clone() + backbone_xyz = x['fp_xyz'].clone() + rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} + + bbox_preds, cls_preds = self.rpn_head(x) + + rpn_loss = self.rpn_head.loss( + bbox_preds=bbox_preds, + cls_preds=cls_preds, + points=points, + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + img_metas=img_metas) + losses.update(rpn_loss) + + bbox_list = self.rpn_head.get_bboxes(points_cat, bbox_preds, cls_preds, + img_metas) + proposal_list = [ + dict( + boxes_3d=bboxes, + scores_3d=scores, + labels_3d=labels, + cls_preds=preds_cls) + for bboxes, scores, labels, preds_cls in bbox_list + ] + rcnn_feats.update({'points_cls_preds': cls_preds}) + + roi_losses = self.roi_head.forward_train(rcnn_feats, img_metas, + proposal_list, gt_bboxes_3d, + gt_labels_3d) + losses.update(roi_losses) + + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Forward of testing. + + Args: + points (list[torch.Tensor]): Points of each sample. + img_metas (list[dict]): Image metas. + imgs (list[torch.Tensor], optional): Images of each sample. + Defaults to None. + rescale (bool, optional): Whether to rescale results. + Defaults to False. + + Returns: + list: Predicted 3d boxes. + """ + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + # features for rcnn + backbone_feats = x['fp_features'].clone() + backbone_xyz = x['fp_xyz'].clone() + rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} + bbox_preds, cls_preds = self.rpn_head(x) + rcnn_feats.update({'points_cls_preds': cls_preds}) + + bbox_list = self.rpn_head.get_bboxes( + points_cat, bbox_preds, cls_preds, img_metas, rescale=rescale) + + proposal_list = [ + dict( + boxes_3d=bboxes, + scores_3d=scores, + labels_3d=labels, + cls_preds=preds_cls) + for bboxes, scores, labels, preds_cls in bbox_list + ] + bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas, + proposal_list) + + return bbox_results diff --git a/mmdet3d/models/detectors/single_stage_mono3d.py b/mmdet3d/models/detectors/single_stage_mono3d.py index b8d2f4dcd1..46c3e8448f 100644 --- a/mmdet3d/models/detectors/single_stage_mono3d.py +++ b/mmdet3d/models/detectors/single_stage_mono3d.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from os import path as osp + import mmcv import numpy as np import torch from mmcv.parallel import DataContainer as DC -from os import path as osp from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result, show_multi_modality_result) @@ -48,14 +49,15 @@ def forward_train(self, image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for - each image in [x, y, z, w, l, h, theta, vx, vy] format. + each image in [x, y, z, x_size, y_size, z_size, yaw, vx, vy] + format. gt_labels_3d (list[Tensor]): 3D class indices corresponding to each box. centers2d (list[Tensor]): Projected 3D centers onto 2D images. depths (list[Tensor]): Depth of projected centers on 2D images. attr_labels (list[Tensor], optional): Attribute indices corresponding to each box - gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + gt_bboxes_ignore (list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Returns: diff --git a/mmdet3d/models/detectors/smoke_mono3d.py b/mmdet3d/models/detectors/smoke_mono3d.py new file mode 100644 index 0000000000..852e4849fe --- /dev/null +++ b/mmdet3d/models/detectors/smoke_mono3d.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.builder import DETECTORS +from .single_stage_mono3d import SingleStageMono3DDetector + + +@DETECTORS.register_module() +class SMOKEMono3D(SingleStageMono3DDetector): + r"""SMOKE `_ for monocular 3D object + detection. + + """ + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/mmdet3d/models/detectors/votenet.py b/mmdet3d/models/detectors/votenet.py index c92ff9c942..4ba0caa88f 100644 --- a/mmdet3d/models/detectors/votenet.py +++ b/mmdet3d/models/detectors/votenet.py @@ -40,11 +40,11 @@ def forward_train(self, img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic + pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): point-wise instance + pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: diff --git a/mmdet3d/models/fusion_layers/coord_transform.py b/mmdet3d/models/fusion_layers/coord_transform.py index b3ad7297a0..7cdcac9143 100644 --- a/mmdet3d/models/fusion_layers/coord_transform.py +++ b/mmdet3d/models/fusion_layers/coord_transform.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch from functools import partial +import torch + from mmdet3d.core.points import get_points_type diff --git a/mmdet3d/models/fusion_layers/point_fusion.py b/mmdet3d/models/fusion_layers/point_fusion.py index 5b587a7d17..97b4177763 100644 --- a/mmdet3d/models/fusion_layers/point_fusion.py +++ b/mmdet3d/models/fusion_layers/point_fusion.py @@ -32,9 +32,9 @@ def point_sample(img_meta, points (torch.Tensor): Nx3 point cloud in LiDAR coordinates. proj_mat (torch.Tensor): 4x4 transformation matrix. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. 
- img_scale_factor (torch.Tensor): Scale factor with shape of \ + img_scale_factor (torch.Tensor): Scale factor with shape of (w_scale, h_scale). - img_crop_offset (torch.Tensor): Crop offset used to crop \ + img_crop_offset (torch.Tensor): Crop offset used to crop image during data augmentation with shape of (w_offset, h_offset). img_flip (bool): Whether the image is flipped. img_pad_shape (tuple[int]): int tuple indicates the h & w after diff --git a/mmdet3d/models/losses/__init__.py b/mmdet3d/models/losses/__init__.py index 7d4703aef9..dcdc69ab6d 100644 --- a/mmdet3d/models/losses/__init__.py +++ b/mmdet3d/models/losses/__init__.py @@ -2,10 +2,13 @@ from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss from .chamfer_distance import ChamferDistance, chamfer_distance +from .multibin_loss import MultiBinLoss from .paconv_regularization_loss import PAConvRegularizationLoss +from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss __all__ = [ 'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance', 'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss', - 'PAConvRegularizationLoss' + 'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss', + 'MultiBinLoss' ] diff --git a/mmdet3d/models/losses/axis_aligned_iou_loss.py b/mmdet3d/models/losses/axis_aligned_iou_loss.py index d0953321dd..5ccef45012 100644 --- a/mmdet3d/models/losses/axis_aligned_iou_loss.py +++ b/mmdet3d/models/losses/axis_aligned_iou_loss.py @@ -54,7 +54,7 @@ def forward(self, Args: pred (torch.Tensor): Bbox predictions with shape [..., 3]. target (torch.Tensor): Bbox targets (gt) with shape [..., 3]. - weight (torch.Tensor|float, optional): Weight of loss. \ + weight (torch.Tensor | float, optional): Weight of loss. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. diff --git a/mmdet3d/models/losses/chamfer_distance.py b/mmdet3d/models/losses/chamfer_distance.py index a9d4b80247..a0caebad78 100644 --- a/mmdet3d/models/losses/chamfer_distance.py +++ b/mmdet3d/models/losses/chamfer_distance.py @@ -29,13 +29,13 @@ def chamfer_distance(src, Returns: tuple: Source and Destination loss with the corresponding indices. - - loss_src (torch.Tensor): The min distance \ + - loss_src (torch.Tensor): The min distance from source to destination. - - loss_dst (torch.Tensor): The min distance \ + - loss_dst (torch.Tensor): The min distance from destination to source. - - indices1 (torch.Tensor): Index the min distance point \ + - indices1 (torch.Tensor): Index the min distance point for each point in source to destination. - - indices2 (torch.Tensor): Index the min distance point \ + - indices2 (torch.Tensor): Index the min distance point for each point in destination to source. """ @@ -125,10 +125,10 @@ def forward(self, Defaults to False. Returns: - tuple[torch.Tensor]: If ``return_indices=True``, return losses of \ - source and target with their corresponding indices in the \ - order of ``(loss_source, loss_target, indices1, indices2)``. \ - If ``return_indices=False``, return \ + tuple[torch.Tensor]: If ``return_indices=True``, return losses of + source and target with their corresponding indices in the + order of ``(loss_source, loss_target, indices1, indices2)``. + If ``return_indices=False``, return ``(loss_source, loss_target)``. 
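[Editor's note, not part of the patch] A minimal standalone sketch of the quantities described in the `chamfer_distance` docstring above; the library version additionally supports different distance modes and per-point weighting.

    import torch

    src = torch.rand(5, 3)                  # source point set
    dst = torch.rand(8, 3)                  # destination point set
    dist = torch.cdist(src, dst)            # pairwise distances, shape (5, 8)
    loss_src, indices1 = dist.min(dim=1)    # min distance source -> destination, per source point
    loss_dst, indices2 = dist.min(dim=0)    # min distance destination -> source, per destination point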
""" assert reduction_override in (None, 'none', 'mean', 'sum') diff --git a/mmdet3d/models/losses/multibin_loss.py b/mmdet3d/models/losses/multibin_loss.py new file mode 100644 index 0000000000..238402fcdc --- /dev/null +++ b/mmdet3d/models/losses/multibin_loss.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn import functional as F + +from mmdet.models.builder import LOSSES +from mmdet.models.losses.utils import weighted_loss + + +@weighted_loss +def multibin_loss(pred_orientations, gt_orientations, num_dir_bins=4): + """Multi-Bin Loss. + + Args: + pred_orientations(torch.Tensor): Predicted local vector + orientation in [axis_cls, head_cls, sin, cos] format. + shape (N, num_dir_bins * 4) + gt_orientations(torch.Tensor): Corresponding gt bboxes, + shape (N, num_dir_bins * 2). + num_dir_bins(int, optional): Number of bins to encode + direction angle. + Defaults: 4. + + Return: + torch.Tensor: Loss tensor. + """ + cls_losses = 0 + reg_losses = 0 + reg_cnt = 0 + for i in range(num_dir_bins): + # bin cls loss + cls_ce_loss = F.cross_entropy( + pred_orientations[:, (i * 2):(i * 2 + 2)], + gt_orientations[:, i].long(), + reduction='mean') + # regression loss + valid_mask_i = (gt_orientations[:, i] == 1) + cls_losses += cls_ce_loss + if valid_mask_i.sum() > 0: + start = num_dir_bins * 2 + i * 2 + end = start + 2 + pred_offset = F.normalize(pred_orientations[valid_mask_i, + start:end]) + gt_offset_sin = torch.sin(gt_orientations[valid_mask_i, + num_dir_bins + i]) + gt_offset_cos = torch.cos(gt_orientations[valid_mask_i, + num_dir_bins + i]) + reg_loss = \ + F.l1_loss(pred_offset[:, 0], gt_offset_sin, + reduction='none') + \ + F.l1_loss(pred_offset[:, 1], gt_offset_cos, + reduction='none') + + reg_losses += reg_loss.sum() + reg_cnt += valid_mask_i.sum() + + return cls_losses / num_dir_bins + reg_losses / reg_cnt + + +@LOSSES.register_module() +class MultiBinLoss(nn.Module): + """Multi-Bin Loss for orientation. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'none'. + loss_weight (float, optional): The weight of loss. Defaults + to 1.0. + """ + + def __init__(self, reduction='none', loss_weight=1.0): + super(MultiBinLoss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, num_dir_bins, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + num_dir_bins (int): Number of bins to encode direction angle. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * multibin_loss( + pred, target, num_dir_bins=num_dir_bins, reduction=reduction) + return loss diff --git a/mmdet3d/models/losses/uncertain_smooth_l1_loss.py b/mmdet3d/models/losses/uncertain_smooth_l1_loss.py new file mode 100644 index 0000000000..d9adccbad3 --- /dev/null +++ b/mmdet3d/models/losses/uncertain_smooth_l1_loss.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from torch import nn as nn + +from mmdet.models.builder import LOSSES +from mmdet.models.losses.utils import weighted_loss + + +@weighted_loss +def uncertain_smooth_l1_loss(pred, target, sigma, alpha=1.0, beta=1.0): + """Smooth L1 loss with uncertainty. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert beta > 0 + assert target.numel() > 0 + assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ + f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ + 'are inconsistent.' + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + loss = torch.exp(-sigma) * loss + alpha * sigma + + return loss + + +@weighted_loss +def uncertain_l1_loss(pred, target, sigma, alpha=1.0): + """L1 loss with uncertainty. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert target.numel() > 0 + assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ + f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ + 'are inconsistent.' + loss = torch.abs(pred - target) + loss = torch.exp(-sigma) * loss + alpha * sigma + return loss + + +@LOSSES.register_module() +class UncertainSmoothL1Loss(nn.Module): + r"""Smooth L1 loss with uncertainty. + + Please refer to `PGD `_ and + `Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry + and Semantics `_ for more details. + + Args: + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. + loss_weight (float, optional): The weight of loss. Defaults to 1.0 + """ + + def __init__(self, alpha=1.0, beta=1.0, reduction='mean', loss_weight=1.0): + super(UncertainSmoothL1Loss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.alpha = alpha + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + sigma, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
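[Editor's note, not part of the patch] Numeric illustration of `uncertain_smooth_l1_loss()` above with alpha = beta = 1: a larger predicted sigma damps the residual term exp(-sigma) * smooth_l1 but adds the alpha * sigma penalty, so the network cannot inflate sigma for free.

    import torch

    pred = torch.tensor([1.0])
    target = torch.tensor([3.0])
    diff = (pred - target).abs()                                    # 2.0 (> beta)
    base = torch.where(diff < 1.0, 0.5 * diff * diff, diff - 0.5)   # smooth-L1 term = 1.5
    for sigma in (torch.tensor([0.0]), torch.tensor([1.0])):
        loss = torch.exp(-sigma) * base + 1.0 * sigma               # 1.5 vs ~1.55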
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * uncertain_smooth_l1_loss( + pred, + target, + weight, + sigma=sigma, + alpha=self.alpha, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@LOSSES.register_module() +class UncertainL1Loss(nn.Module): + """L1 loss with uncertainty. + + Args: + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, alpha=1.0, reduction='mean', loss_weight=1.0): + super(UncertainL1Loss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + sigma, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * uncertain_l1_loss( + pred, + target, + weight, + sigma=sigma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + return loss_bbox diff --git a/mmdet3d/models/middle_encoders/sparse_encoder.py b/mmdet3d/models/middle_encoders/sparse_encoder.py index 1a03e9b5c9..8b296c1a47 100644 --- a/mmdet3d/models/middle_encoders/sparse_encoder.py +++ b/mmdet3d/models/middle_encoders/sparse_encoder.py @@ -14,19 +14,21 @@ class SparseEncoder(nn.Module): Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. - order (list[str]): Order of conv module. Defaults to ('conv', - 'norm', 'act'). - norm_cfg (dict): Config of normalization layer. Defaults to + order (list[str], optional): Order of conv module. + Defaults to ('conv', 'norm', 'act'). + norm_cfg (dict, optional): Config of normalization layer. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). - base_channels (int): Out channels for conv_input layer. + base_channels (int, optional): Out channels for conv_input layer. Defaults to 16. - output_channels (int): Out channels for conv_out layer. + output_channels (int, optional): Out channels for conv_out layer. Defaults to 128. - encoder_channels (tuple[tuple[int]]): + encoder_channels (tuple[tuple[int]], optional): Convolutional channels of each encode block. - encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. + encoder_paddings (tuple[tuple[int]], optional): + Paddings of each encode block. Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). - block_type (str): Type of the block to use. Defaults to 'conv_module'. + block_type (str, optional): Type of the block to use. + Defaults to 'conv_module'. 
""" def __init__(self, @@ -99,7 +101,7 @@ def forward(self, voxel_features, coors, batch_size): Args: voxel_features (torch.float32): Voxel features in shape (N, C). - coors (torch.int32): Coordinates in shape (N, 4), \ + coors (torch.int32): Coordinates in shape (N, 4), the columns in the order of (batch_idx, z_idx, y_idx, x_idx). batch_size (int): Batch size. @@ -139,9 +141,9 @@ def make_encoder_layers(self, make_block (method): A bounded function to build blocks. norm_cfg (dict[str]): Config of normalization layer. in_channels (int): The number of encoder input channels. - block_type (str): Type of the block to use. Defaults to - 'conv_module'. - conv_cfg (dict): Config of conv layer. Defaults to + block_type (str, optional): Type of the block to use. + Defaults to 'conv_module'. + conv_cfg (dict, optional): Config of conv layer. Defaults to dict(type='SubMConv3d'). Returns: diff --git a/mmdet3d/models/model_utils/__init__.py b/mmdet3d/models/model_utils/__init__.py index 83d4c4a8f6..34df79a22d 100644 --- a/mmdet3d/models/model_utils/__init__.py +++ b/mmdet3d/models/model_utils/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .edge_fusion_module import EdgeFusionModule from .transformer import GroupFree3DMHA from .vote_module import VoteModule -__all__ = ['VoteModule', 'GroupFree3DMHA'] +__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule'] diff --git a/mmdet3d/models/model_utils/edge_fusion_module.py b/mmdet3d/models/model_utils/edge_fusion_module.py new file mode 100644 index 0000000000..2d9e09ee2b --- /dev/null +++ b/mmdet3d/models/model_utils/edge_fusion_module.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + + +class EdgeFusionModule(BaseModule): + """Edge Fusion Module for feature map. + + Args: + out_channels (int): The number of output channels. + feat_channels (int): The number of channels in feature map + during edge feature fusion. + kernel_size (int, optional): Kernel size of convolution. + Default: 3. + act_cfg (dict, optional): Config of activation. + Default: dict(type='ReLU'). + norm_cfg (dict, optional): Config of normalization. + Default: dict(type='BN1d')). + """ + + def __init__(self, + out_channels, + feat_channels, + kernel_size=3, + act_cfg=dict(type='ReLU'), + norm_cfg=dict(type='BN1d')): + super().__init__() + self.edge_convs = nn.Sequential( + ConvModule( + feat_channels, + feat_channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg), + nn.Conv1d(feat_channels, out_channels, kernel_size=1)) + self.feat_channels = feat_channels + + def forward(self, features, fused_features, edge_indices, edge_lens, + output_h, output_w): + """Forward pass. + + Args: + features (torch.Tensor): Different representative features + for fusion. + fused_features (torch.Tensor): Different representative + features to be fused. + edge_indices (torch.Tensor): Batch image edge indices. + edge_lens (list[int]): List of edge length of each image. + output_h (int): Height of output feature map. + output_w (int): Width of output feature map. + + Returns: + torch.Tensor: Fused feature maps. 
+ """ + batch_size = features.shape[0] + # normalize + grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float() + grid_edge_indices[..., 0] = \ + grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1 + grid_edge_indices[..., 1] = \ + grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1 + + # apply edge fusion + edge_features = F.grid_sample( + features, grid_edge_indices, align_corners=True).squeeze(-1) + edge_output = self.edge_convs(edge_features) + + for k in range(batch_size): + edge_indice_k = edge_indices[k, :edge_lens[k]] + fused_features[k, :, edge_indice_k[:, 1], + edge_indice_k[:, 0]] += edge_output[ + k, :, :edge_lens[k]] + + return fused_features diff --git a/mmdet3d/models/model_utils/transformer.py b/mmdet3d/models/model_utils/transformer.py index 4a1af93dd8..4f9a833e1f 100644 --- a/mmdet3d/models/model_utils/transformer.py +++ b/mmdet3d/models/model_utils/transformer.py @@ -15,15 +15,16 @@ class GroupFree3DMHA(MultiheadAttention): embed_dims (int): The embedding dimension. num_heads (int): Parallel attention heads. Same as `nn.MultiheadAttention`. - attn_drop (float): A Dropout layer on attn_output_weights. Default 0.0. - proj_drop (float): A Dropout layer. Default 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used + attn_drop (float, optional): A Dropout layer on attn_output_weights. + Defaults to 0.0. + proj_drop (float, optional): A Dropout layer. Defaults to 0.0. + dropout_layer (obj:`ConfigDict`, optional): The dropout_layer used when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - batch_first (bool): Key, Query and Value are shape of + init_cfg (obj:`mmcv.ConfigDict`, optional): The Config for + initialization. Default: None. + batch_first (bool, optional): Key, Query and Value are shape of (batch, n, embed_dim) - or (n, batch, embed_dim). Default to False. + or (n, batch, embed_dim). Defaults to False. """ def __init__(self, @@ -58,26 +59,26 @@ def forward(self, embed_dims]. Same in `nn.MultiheadAttention.forward`. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims]. Same in `nn.MultiheadAttention.forward`. - If None, the ``query`` will be used. Defaults to None. + If None, the ``query`` will be used. value (Tensor): The value tensor with same shape as `key`. - Same in `nn.MultiheadAttention.forward`. Defaults to None. + Same in `nn.MultiheadAttention.forward`. If None, the `key` will be used. identity (Tensor): This tensor, with the same shape as x, - will be used for the identity link. - If None, `x` will be used. Defaults to None. - query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. - key_pos (Tensor): The positional encoding for `key`, with the - same shape as `key`. Defaults to None. If not None, it will - be added to `key` before forward function. If None, and - `query_pos` has the same shape as `key`, then `query_pos` + will be used for the identity link. If None, `x` will be used. + query_pos (Tensor, optional): The positional encoding for query, + with the same shape as `x`. Defaults to None. + If not None, it will be added to `x` before forward function. + key_pos (Tensor, optional): The positional encoding for `key`, + with the same shape as `key`. Defaults to None. If not None, + it will be added to `key` before forward function. If None, + and `query_pos` has the same shape as `key`, then `query_pos` will be used for `key_pos`. Defaults to None. 
- attn_mask (Tensor): ByteTensor mask with shape [num_queries, - num_keys]. Same in `nn.MultiheadAttention.forward`. - Defaults to None. - key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + attn_mask (Tensor, optional): ByteTensor mask with shape + [num_queries, num_keys]. Same in `nn.MultiheadAttention.forward`. Defaults to None. + key_padding_mask (Tensor, optional): ByteTensor with shape + [bs, num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. @@ -113,7 +114,7 @@ class ConvBNPositionalEncoding(nn.Module): Args: input_channel (int): input features dim. - num_pos_feats (int): output position features dim. + num_pos_feats (int, optional): output position features dim. Defaults to 288 to be consistent with seed features dim. """ @@ -131,7 +132,7 @@ def forward(self, xyz): xyz (Tensor): (B, N, 3) the coordinates to embed. Returns: - Tensor: (B, num_pos_feats, N) the embeded position features. + Tensor: (B, num_pos_feats, N) the embedded position features. """ xyz = xyz.permute(0, 2, 1) position_embedding = self.position_embedding_head(xyz) diff --git a/mmdet3d/models/model_utils/vote_module.py b/mmdet3d/models/model_utils/vote_module.py index 75ca537ccb..5cc52ad9d4 100644 --- a/mmdet3d/models/model_utils/vote_module.py +++ b/mmdet3d/models/model_utils/vote_module.py @@ -14,22 +14,25 @@ class VoteModule(nn.Module): Args: in_channels (int): Number of channels of seed point features. - vote_per_seed (int): Number of votes generated from each seed point. - gt_per_seed (int): Number of ground truth votes generated - from each seed point. - num_points (int): Number of points to be used for voting. - conv_channels (tuple[int]): Out channels of vote - generating convolution. - conv_cfg (dict): Config of convolution. + vote_per_seed (int, optional): Number of votes generated from + each seed point. Default: 1. + gt_per_seed (int, optional): Number of ground truth votes generated + from each seed point. Default: 3. + num_points (int, optional): Number of points to be used for voting. + Default: 1. + conv_channels (tuple[int], optional): Out channels of vote + generating convolution. Default: (16, 16). + conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). - norm_cfg (dict): Config of normalization. + norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). - norm_feats (bool): Whether to normalize features. + norm_feats (bool, optional): Whether to normalize features. Default: True. - with_res_feat (bool): Whether to predict residual features. + with_res_feat (bool, optional): Whether to predict residual features. Default: True. - vote_xyz_range (list[float], None): The range of points translation. - vote_loss (dict): Config of vote loss. + vote_xyz_range (list[float], optional): + The range of points translation. Default: None. + vote_loss (dict, optional): Config of vote loss. Default: None. """ def __init__(self, @@ -95,10 +98,10 @@ def forward(self, seed_points, seed_feats): Returns: tuple[torch.Tensor]: - - vote_points: Voted xyz based on the seed points \ + - vote_points: Voted xyz based on the seed points with shape (B, M, 3), ``M=num_seed*vote_per_seed``. - - vote_features: Voted features based on the seed points with \ - shape (B, C, M) where ``M=num_seed*vote_per_seed``, \ + - vote_features: Voted features based on the seed points with + shape (B, C, M) where ``M=num_seed*vote_per_seed``, ``C=vote_feature_dim``. 
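[Editor's note, not part of the patch] Shape summary for `VoteModule.forward()` as documented above, with illustrative numbers and assuming the voted feature dimension matches the seed feature dimension as in the default VoteNet setup:

    B, num_seed, vote_per_seed, feat_dim = 2, 1024, 1, 256
    seed_points_shape = (B, num_seed, 3)
    seed_feats_shape = (B, feat_dim, num_seed)
    M = num_seed * vote_per_seed
    vote_points_shape = (B, M, 3)            # voted xyz
    vote_features_shape = (B, feat_dim, M)   # voted features, C = vote_feature_dim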
""" if self.num_points != -1: diff --git a/mmdet3d/models/necks/__init__.py b/mmdet3d/models/necks/__init__.py index 9752a8b490..5443d357d7 100644 --- a/mmdet3d/models/necks/__init__.py +++ b/mmdet3d/models/necks/__init__.py @@ -1,6 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.necks.fpn import FPN +from .dla_neck import DLANeck from .imvoxel_neck import OutdoorImVoxelNeck +from .pointnet2_fp_neck import PointNetFPNeck from .second_fpn import SECONDFPN -__all__ = ['FPN', 'SECONDFPN', 'OutdoorImVoxelNeck'] +__all__ = [ + 'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck' +] diff --git a/mmdet3d/models/necks/dla_neck.py b/mmdet3d/models/necks/dla_neck.py new file mode 100644 index 0000000000..d049aede65 --- /dev/null +++ b/mmdet3d/models/necks/dla_neck.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +from mmcv.cnn import ConvModule, build_conv_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet.models.builder import NECKS + + +def fill_up_weights(up): + """Simulated bilinear upsampling kernel. + + Args: + up (nn.Module): ConvTranspose2d module. + """ + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class IDAUpsample(BaseModule): + """Iterative Deep Aggregation (IDA) Upsampling module to upsample features + of different scales to a similar scale. + + Args: + out_channels (int): Number of output channels for DeformConv. + in_channels (List[int]): List of input channels of multi-scale + feature maps. + kernel_sizes (List[int]): List of size of the convolving + kernel of different scales. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + use_dcn (bool, optional): If True, use DCNv2. Default: True. + """ + + def __init__( + self, + out_channels, + in_channels, + kernel_sizes, + norm_cfg=None, + use_dcn=True, + init_cfg=None, + ): + super(IDAUpsample, self).__init__(init_cfg) + self.use_dcn = use_dcn + self.projs = nn.ModuleList() + self.ups = nn.ModuleList() + self.nodes = nn.ModuleList() + + for i in range(1, len(in_channels)): + in_channel = in_channels[i] + up_kernel_size = int(kernel_sizes[i]) + proj = ConvModule( + in_channel, + out_channels, + 3, + padding=1, + bias=True, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=norm_cfg) + node = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + bias=True, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=norm_cfg) + up = build_conv_layer( + dict(type='deconv'), + out_channels, + out_channels, + up_kernel_size * 2, + stride=up_kernel_size, + padding=up_kernel_size // 2, + output_padding=0, + groups=out_channels, + bias=False) + + self.projs.append(proj) + self.ups.append(up) + self.nodes.append(node) + + def forward(self, mlvl_features, start_level, end_level): + """Forward function. + + Args: + mlvl_features (list[torch.Tensor]): Features from multiple layers. + start_level (int): Start layer for feature upsampling. + end_level (int): End layer for feature upsampling. 
+ """ + for i in range(start_level, end_level - 1): + upsample = self.ups[i - start_level] + project = self.projs[i - start_level] + mlvl_features[i + 1] = upsample(project(mlvl_features[i + 1])) + node = self.nodes[i - start_level] + mlvl_features[i + 1] = node(mlvl_features[i + 1] + + mlvl_features[i]) + + +class DLAUpsample(BaseModule): + """Deep Layer Aggregation (DLA) Upsampling module for different scales + feature extraction, upsampling and fusion, It consists of groups of + IDAupsample modules. + + Args: + start_level (int): The start layer. + channels (List[int]): List of input channels of multi-scale + feature maps. + scales(List[int]): List of scale of different layers' feature. + in_channels (NoneType, optional): List of input channels of + different scales. Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + use_dcn (bool, optional): Whether to use dcn in IDAup module. + Default: True. + """ + + def __init__(self, + start_level, + channels, + scales, + in_channels=None, + norm_cfg=None, + use_dcn=True, + init_cfg=None): + super(DLAUpsample, self).__init__(init_cfg) + self.start_level = start_level + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr( + self, 'ida_{}'.format(i), + IDAUpsample(channels[j], in_channels[j:], + scales[j:] // scales[j], norm_cfg, use_dcn)) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, mlvl_features): + """Forward function. + + Args: + mlvl_features(list[torch.Tensor]): Features from multi-scale + layers. + + Returns: + tuple[torch.Tensor]: Up-sampled features of different layers. + """ + outs = [mlvl_features[-1]] + for i in range(len(mlvl_features) - self.start_level - 1): + ida = getattr(self, 'ida_{}'.format(i)) + ida(mlvl_features, len(mlvl_features) - i - 2, len(mlvl_features)) + outs.insert(0, mlvl_features[-1]) + return outs + + +@NECKS.register_module() +class DLANeck(BaseModule): + """DLA Neck. + + Args: + in_channels (list[int], optional): List of input channels + of multi-scale feature map. + start_level (int, optional): The scale level where upsampling + starts. Default: 2. + end_level (int, optional): The scale level where upsampling + ends. Default: 5. + norm_cfg (dict, optional): Config dict for normalization + layer. Default: None. + use_dcn (bool, optional): Whether to use dcn in IDAup module. + Default: True. 
+ """ + + def __init__(self, + in_channels=[16, 32, 64, 128, 256, 512], + start_level=2, + end_level=5, + norm_cfg=None, + use_dcn=True, + init_cfg=None): + super(DLANeck, self).__init__(init_cfg) + self.start_level = start_level + self.end_level = end_level + scales = [2**i for i in range(len(in_channels[self.start_level:]))] + self.dla_up = DLAUpsample( + start_level=self.start_level, + channels=in_channels[self.start_level:], + scales=scales, + norm_cfg=norm_cfg, + use_dcn=use_dcn) + self.ida_up = IDAUpsample( + in_channels[self.start_level], + in_channels[self.start_level:self.end_level], + [2**i for i in range(self.end_level - self.start_level)], norm_cfg, + use_dcn) + + def forward(self, x): + mlvl_features = [x[i] for i in range(len(x))] + mlvl_features = self.dla_up(mlvl_features) + outs = [] + for i in range(self.end_level - self.start_level): + outs.append(mlvl_features[i].clone()) + self.ida_up(outs, 0, len(outs)) + return [outs[-1]] + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.ConvTranspose2d): + # In order to be consistent with the source code, + # reset the ConvTranspose2d initialization parameters + m.reset_parameters() + # Simulated bilinear upsampling kernel + fill_up_weights(m) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() diff --git a/mmdet3d/models/necks/pointnet2_fp_neck.py b/mmdet3d/models/necks/pointnet2_fp_neck.py new file mode 100644 index 0000000000..8734848ef8 --- /dev/null +++ b/mmdet3d/models/necks/pointnet2_fp_neck.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.ops import PointFPModule +from mmdet.models import NECKS + + +@NECKS.register_module() +class PointNetFPNeck(BaseModule): + r"""PointNet FP Module used in PointRCNN. + + Refer to the `official code `_. + + .. code-block:: none + + sa_n ---------------------------------------- + | + ... --------------------------------- | + | | + sa_1 ------------- | | + | | | + sa_0 -> fp_0 -> fp_module ->fp_1 -> ... -> fp_module -> fp_n + + sa_n including sa_xyz (torch.Tensor) and sa_features (torch.Tensor) + fp_n including fp_xyz (torch.Tensor) and fp_features (torch.Tensor) + + Args: + fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, fp_channels, init_cfg=None): + super(PointNetFPNeck, self).__init__(init_cfg=init_cfg) + + self.num_fp = len(fp_channels) + self.FP_modules = nn.ModuleList() + for cur_fp_mlps in fp_channels: + self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone, which may contain + the following keys and values: + + - sa_xyz (list[torch.Tensor]): Points of each sa module + in shape (N, 3). + - sa_features (list[torch.Tensor]): Output features of + each sa module in shape (N, M). + + Returns: + list[torch.Tensor]: Coordinates of multiple levels of points. + list[torch.Tensor]: Features of multiple levels of points. 
+ """ + sa_xyz = feat_dict['sa_xyz'] + sa_features = feat_dict['sa_features'] + assert len(sa_xyz) == len(sa_features) + + return sa_xyz, sa_features + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + dict[str, torch.Tensor]: Outputs of the Neck. + + - fp_xyz (torch.Tensor): The coordinates of fp features. + - fp_features (torch.Tensor): The features from the last + feature propagation layers. + """ + sa_xyz, sa_features = self._extract_input(feat_dict) + + fp_feature = sa_features[-1] + fp_xyz = sa_xyz[-1] + + for i in range(self.num_fp): + # consume the points in a bottom-up manner + fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], + sa_features[-(i + 2)], fp_feature) + fp_xyz = sa_xyz[-(i + 2)] + + ret = dict(fp_xyz=fp_xyz, fp_features=fp_feature) + return ret diff --git a/mmdet3d/models/roi_heads/__init__.py b/mmdet3d/models/roi_heads/__init__.py index 509c9ccb61..e607570d71 100644 --- a/mmdet3d/models/roi_heads/__init__.py +++ b/mmdet3d/models/roi_heads/__init__.py @@ -4,10 +4,11 @@ from .h3d_roi_head import H3DRoIHead from .mask_heads import PointwiseSemanticHead, PrimitiveHead from .part_aggregation_roi_head import PartAggregationROIHead +from .point_rcnn_roi_head import PointRCNNRoIHead from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor __all__ = [ 'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead', 'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor', - 'H3DRoIHead', 'PrimitiveHead' + 'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead' ] diff --git a/mmdet3d/models/roi_heads/base_3droi_head.py b/mmdet3d/models/roi_heads/base_3droi_head.py index b19b5a96b2..e1816ff6bf 100644 --- a/mmdet3d/models/roi_heads/base_3droi_head.py +++ b/mmdet3d/models/roi_heads/base_3droi_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod + from mmcv.runner import BaseModule diff --git a/mmdet3d/models/roi_heads/bbox_heads/__init__.py b/mmdet3d/models/roi_heads/bbox_heads/__init__.py index 6294f52f4c..fd7a6b04ae 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/__init__.py +++ b/mmdet3d/models/roi_heads/bbox_heads/__init__.py @@ -5,9 +5,10 @@ Shared4Conv1FCBBoxHead) from .h3d_bbox_head import H3DBboxHead from .parta2_bbox_head import PartA2BboxHead +from .point_rcnn_bbox_head import PointRCNNBboxHead __all__ = [ 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead', - 'H3DBboxHead' + 'H3DBboxHead', 'PointRCNNBboxHead' ] diff --git a/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py index 5126aa598b..033dd468c7 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py +++ b/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py @@ -20,7 +20,7 @@ class H3DBboxHead(BaseModule): Args: num_classes (int): The number of classes. - suface_matching_cfg (dict): Config for suface primitive matching. + surface_matching_cfg (dict): Config for surface primitive matching. line_matching_cfg (dict): Config for line primitive matching. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. @@ -36,7 +36,7 @@ class H3DBboxHead(BaseModule): primitive_refine_channels (tuple[int]): Convolution channels of prediction layer. upper_thresh (float): Threshold for line matching. - surface_thresh (float): Threshold for suface matching. 
+ surface_thresh (float): Threshold for surface matching. line_thresh (float): Threshold for line matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. @@ -324,16 +324,16 @@ def loss(self, Args: bbox_preds (dict): Predictions from forward of h3d bbox head. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. rpn_targets (Tuple) : Targets generated by rpn head. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: @@ -502,7 +502,7 @@ def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes(points) + box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) @@ -560,25 +560,25 @@ def get_proposal_stage_loss(self, Args: bbox_preds (dict): Predictions from forward of vote head. - size_class_targets (torch.Tensor): Ground truth \ + size_class_targets (torch.Tensor): Ground truth size class of each prediction bounding box. - size_res_targets (torch.Tensor): Ground truth \ + size_res_targets (torch.Tensor): Ground truth size residual of each prediction bounding box. - dir_class_targets (torch.Tensor): Ground truth \ + dir_class_targets (torch.Tensor): Ground truth direction class of each prediction bounding box. - dir_res_targets (torch.Tensor): Ground truth \ + dir_res_targets (torch.Tensor): Ground truth direction residual of each prediction bounding box. - center_targets (torch.Tensor): Ground truth center \ + center_targets (torch.Tensor): Ground truth center of each prediction bounding box. - mask_targets (torch.Tensor): Validation of each \ + mask_targets (torch.Tensor): Validation of each prediction bounding box. - objectness_targets (torch.Tensor): Ground truth \ + objectness_targets (torch.Tensor): Ground truth objectness label of each prediction bounding box. - objectness_weights (torch.Tensor): Weights of objectness \ + objectness_weights (torch.Tensor): Weights of objectness loss for each prediction bounding box. - box_loss_weights (torch.Tensor): Weights of regression \ + box_loss_weights (torch.Tensor): Weights of regression loss for each prediction bounding box. - valid_gt_weights (torch.Tensor): Validation of each \ + valid_gt_weights (torch.Tensor): Validation of each ground truth bounding box. Returns: @@ -663,12 +663,12 @@ def get_targets(self, Args: points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. 
- pts_instance_mask (None | list[torch.Tensor]): Point-wise instance + pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. @@ -769,22 +769,22 @@ def get_targets_single(self, Args: points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (None | torch.Tensor): Point-wise semantic + pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. - pts_instance_mask (None | torch.Tensor): Point-wise instance + pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. aggregated_points (torch.Tensor): Aggregated points from vote aggregation layer. pred_surface_center (torch.Tensor): Prediction of surface center. pred_line_center (torch.Tensor): Prediction of line center. - pred_obj_surface_center (torch.Tensor): Objectness prediction \ + pred_obj_surface_center (torch.Tensor): Objectness prediction of surface center. - pred_obj_line_center (torch.Tensor): Objectness prediction of \ + pred_obj_line_center (torch.Tensor): Objectness prediction of line center. - pred_surface_sem (torch.Tensor): Semantic prediction of \ + pred_surface_sem (torch.Tensor): Semantic prediction of surface center. pred_line_sem (torch.Tensor): Semantic prediction of line center. Returns: diff --git a/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py index bc2c9ea8c4..e6e4e2b77b 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py +++ b/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py @@ -285,7 +285,7 @@ def forward(self, seg_feats, part_feats): def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights): - """Coumputing losses. + """Computing losses. Args: cls_score (torch.Tensor): Scores of each roi. @@ -344,7 +344,7 @@ def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, pred_boxes3d[..., 0:3] = rotation_3d_in_axis( pred_boxes3d[..., 0:3].unsqueeze(1), - (pos_rois_rotation + np.pi / 2), + pos_rois_rotation, axis=2).squeeze(1) pred_boxes3d[:, 0:3] += roi_xyz @@ -436,8 +436,7 @@ def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): pos_gt_bboxes_ct[..., 0:3] -= roi_center pos_gt_bboxes_ct[..., 6] -= roi_ry pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( - pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), - -(roi_ry + np.pi / 2), + pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry, axis=2).squeeze(1) # flip orientation if rois have opposite orientation @@ -462,12 +461,13 @@ def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) - def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1): + def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): """Calculate corner loss of given boxes. Args: pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). + delta (float, optional): huber loss threshold. Defaults to 1.0 Returns: torch.FloatTensor: Calculated corner loss in shape (N). 
@@ -490,8 +490,8 @@ def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1): torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) # (N, 8) # huber loss - abs_error = torch.abs(corner_dist) - quadratic = torch.clamp(abs_error, max=delta) + abs_error = corner_dist.abs() + quadratic = abs_error.clamp(max=delta) linear = (abs_error - quadratic) corner_loss = 0.5 * quadratic**2 + delta * linear @@ -530,8 +530,7 @@ def get_bboxes(self, local_roi_boxes[..., 0:3] = 0 rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( - rcnn_boxes3d[..., 0:3].unsqueeze(1), (roi_ry + np.pi / 2), - axis=2).squeeze(1) + rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) rcnn_boxes3d[:, 0:3] += roi_xyz # post processing @@ -542,13 +541,13 @@ def get_bboxes(self, cur_box_prob = class_pred[batch_id] cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] - selected = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, - cfg.score_thr, cfg.nms_thr, - img_metas[batch_id], - cfg.use_rotate_nms) - selected_bboxes = cur_rcnn_boxes3d[selected] - selected_label_preds = cur_class_labels[selected] - selected_scores = cur_cls_score[selected] + keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, + cfg.score_thr, cfg.nms_thr, + img_metas[batch_id], + cfg.use_rotate_nms) + selected_bboxes = cur_rcnn_boxes3d[keep] + selected_label_preds = cur_class_labels[keep] + selected_scores = cur_cls_score[keep] result_list.append( (img_metas[batch_id]['box_type_3d'](selected_bboxes, @@ -576,7 +575,7 @@ def multi_class_nms(self, box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). score_thr (float): Threshold of scores. nms_thr (float): Threshold for NMS. - input_meta (dict): Meta informations of the current sample. + input_meta (dict): Meta information of the current sample. use_rotate_nms (bool, optional): Whether to use rotated nms. Defaults to True. @@ -620,6 +619,6 @@ def multi_class_nms(self, dtype=torch.int64, device=box_preds.device)) - selected = torch.cat( + keep = torch.cat( selected_list, dim=0) if len(selected_list) > 0 else [] - return selected + return keep diff --git a/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py new file mode 100644 index 0000000000..27c9ca9f48 --- /dev/null +++ b/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py @@ -0,0 +1,577 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.cnn import ConvModule, normal_init +from mmcv.cnn.bricks import build_conv_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, + rotation_3d_in_axis, xywhr2xyxyr) +from mmdet3d.models.builder import build_loss +from mmdet3d.ops import build_sa_module +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from mmdet.core import build_bbox_coder, multi_apply +from mmdet.models import HEADS + + +@HEADS.register_module() +class PointRCNNBboxHead(BaseModule): + """PointRCNN RoI Bbox head. + + Args: + num_classes (int): The number of classes to predict. + in_channels (int): Input channels of point features. + mlp_channels (list[int]): The number of mlp channels. + pred_layer_cfg (dict, optional): Config of classification and + regression prediction layers. Defaults to None. + num_points (tuple, optional): The number of points which each SA + module samples. Defaults to (128, 32, -1).
+ radius (tuple, optional): Sampling radius of each SA module. + Defaults to (0.2, 0.4, 100). + num_samples (tuple, optional): The number of samples for ball query + in each SA module. Defaults to (64, 64, 64). + sa_channels (tuple, optional): Out channels of each mlp in SA module. + Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)). + bbox_coder (dict, optional): Config dict of box coders. + Defaults to dict(type='DeltaXYZWLHRBBoxCoder'). + sa_cfg (dict, optional): Config of set abstraction module, which may + contain the following keys and values: + + - pool_mod (str): Pool method ('max' or 'avg') for SA modules. + - use_xyz (bool): Whether to use xyz as a part of features. + - normalize_xyz (bool): Whether to normalize xyz with radii in + each SA module. + Defaults to dict(type='PointSAModule', pool_mod='max', + use_xyz=True). + conv_cfg (dict, optional): Config dict of convolutional layers. + Defaults to dict(type='Conv1d'). + norm_cfg (dict, optional): Config dict of normalization layers. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Config dict of activation layers. + Defaults to dict(type='ReLU'). + bias (str, optional): Type of bias. Defaults to 'auto'. + loss_bbox (dict, optional): Config of regression loss function. + Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0, + reduction='sum', loss_weight=1.0). + loss_cls (dict, optional): Config of classification loss function. + Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True, + reduction='sum', loss_weight=1.0). + with_corner_loss (bool, optional): Whether using corner loss. + Defaults to True. + init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__( + self, + num_classes, + in_channels, + mlp_channels, + pred_layer_cfg=None, + num_points=(128, 32, -1), + radius=(0.2, 0.4, 100), + num_samples=(64, 64, 64), + sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + sa_cfg=dict(type='PointSAModule', pool_mod='max', use_xyz=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + bias='auto', + loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + with_corner_loss=True, + init_cfg=None): + super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.num_sa = len(sa_channels) + self.with_corner_loss = with_corner_loss + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.bias = bias + + self.loss_bbox = build_loss(loss_bbox) + self.loss_cls = build_loss(loss_cls) + self.bbox_coder = build_bbox_coder(bbox_coder) + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + + self.in_channels = in_channels + mlp_channels = [self.in_channels] + mlp_channels + shared_mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + shared_mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + inplace=False, + conv_cfg=dict(type='Conv2d'))) + self.xyz_up_layer = nn.Sequential(*shared_mlps) + + c_out = mlp_channels[-1] + self.merge_down_layer = ConvModule( + c_out * 2, + c_out, + kernel_size=(1, 1), + stride=(1, 1), + inplace=False, + conv_cfg=dict(type='Conv2d')) + + pre_channels = c_out + + self.SA_modules = nn.ModuleList() + sa_in_channel = pre_channels + + for sa_index in 
range(self.num_sa): + cur_sa_mlps = list(sa_channels[sa_index]) + cur_sa_mlps = [sa_in_channel] + cur_sa_mlps + sa_out_channel = cur_sa_mlps[-1] + + cur_num_points = num_points[sa_index] + if cur_num_points <= 0: + cur_num_points = None + self.SA_modules.append( + build_sa_module( + num_point=cur_num_points, + radius=radius[sa_index], + num_sample=num_samples[sa_index], + mlp_channels=cur_sa_mlps, + cfg=sa_cfg)) + sa_in_channel = sa_out_channel + self.cls_convs = self._add_conv_branch( + pred_layer_cfg.in_channels, pred_layer_cfg.cls_conv_channels) + self.reg_convs = self._add_conv_branch( + pred_layer_cfg.in_channels, pred_layer_cfg.reg_conv_channels) + + prev_channel = pred_layer_cfg.cls_conv_channels[-1] + self.conv_cls = build_conv_layer( + self.conv_cfg, + in_channels=prev_channel, + out_channels=self.num_classes, + kernel_size=1) + prev_channel = pred_layer_cfg.reg_conv_channels[-1] + self.conv_reg = build_conv_layer( + self.conv_cfg, + in_channels=prev_channel, + out_channels=self.bbox_coder.code_size * self.num_classes, + kernel_size=1) + + if init_cfg is None: + self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d']) + + def _add_conv_branch(self, in_channels, conv_channels): + """Add shared or separable branch. + + Args: + in_channels (int): Input feature channel. + conv_channels (tuple): Middle feature channels. + """ + conv_spec = [in_channels] + list(conv_channels) + # add branch specific conv layers + conv_layers = nn.Sequential() + for i in range(len(conv_spec) - 1): + conv_layers.add_module( + f'layer{i}', + ConvModule( + conv_spec[i], + conv_spec[i + 1], + kernel_size=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.bias, + inplace=True)) + return conv_layers + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + normal_init(self.conv_reg.weight, mean=0, std=0.001) + + def forward(self, feats): + """Forward pass. + + Args: + feats (torch.Tensor): Features from RCNN modules. + + Returns: + tuple[torch.Tensor]: Class scores and bbox predictions. + """ + input_data = feats.clone().detach() + xyz_input = input_data[..., 0:self.in_channels].transpose( + 1, 2).unsqueeze(dim=3).contiguous().clone().detach() + xyz_features = self.xyz_up_layer(xyz_input) + rpn_features = input_data[..., self.in_channels:].transpose( + 1, 2).unsqueeze(dim=3) + merged_features = torch.cat((xyz_features, rpn_features), dim=1) + merged_features = self.merge_down_layer(merged_features) + l_xyz, l_features = [input_data[..., 0:3].contiguous()], \ + [merged_features.squeeze(dim=3)] + for i in range(len(self.SA_modules)): + li_xyz, li_features, cur_indices = \ + self.SA_modules[i](l_xyz[i], l_features[i]) + l_xyz.append(li_xyz) + l_features.append(li_features) + + shared_features = l_features[-1] + x_cls = shared_features + x_reg = shared_features + x_cls = self.cls_convs(x_cls) + rcnn_cls = self.conv_cls(x_cls) + x_reg = self.reg_convs(x_reg) + rcnn_reg = self.conv_reg(x_reg) + rcnn_cls = rcnn_cls.transpose(1, 2).contiguous().squeeze(dim=1) + rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1) + return (rcnn_cls, rcnn_reg) + + def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, + pos_gt_bboxes, reg_mask, label_weights, bbox_weights): + """Computing losses. + + Args: + cls_score (torch.Tensor): Scores of each RoI.
+ bbox_pred (torch.Tensor): Predictions of bboxes. + rois (torch.Tensor): RoI bboxes. + labels (torch.Tensor): Labels of class. + bbox_targets (torch.Tensor): Target of positive bboxes. + pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. + reg_mask (torch.Tensor): Mask for positive bboxes. + label_weights (torch.Tensor): Weights of class loss. + bbox_weights (torch.Tensor): Weights of bbox loss. + + Returns: + dict: Computed losses. + + - loss_cls (torch.Tensor): Loss of classes. + - loss_bbox (torch.Tensor): Loss of bboxes. + - loss_corner (torch.Tensor): Loss of corners. + """ + losses = dict() + rcnn_batch_size = cls_score.shape[0] + # calculate class loss + cls_flat = cls_score.view(-1) + loss_cls = self.loss_cls(cls_flat, labels, label_weights) + losses['loss_cls'] = loss_cls + + # calculate regression loss + code_size = self.bbox_coder.code_size + pos_inds = (reg_mask > 0) + + pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds].clone() + bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( + 1, pos_bbox_pred.shape[-1]) + loss_bbox = self.loss_bbox( + pos_bbox_pred.unsqueeze(dim=0), + bbox_targets.unsqueeze(dim=0).detach(), + bbox_weights_flat.unsqueeze(dim=0)) + losses['loss_bbox'] = loss_bbox + + if pos_inds.any() != 0 and self.with_corner_loss: + rois = rois.detach() + pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] + pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) + batch_anchors = pos_roi_boxes3d.clone().detach() + pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) + roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) + batch_anchors[..., 0:3] = 0 + # decode boxes + pred_boxes3d = self.bbox_coder.decode( + batch_anchors, + pos_bbox_pred.view(-1, code_size)).view(-1, code_size) + + pred_boxes3d[..., 0:3] = rotation_3d_in_axis( + pred_boxes3d[..., 0:3].unsqueeze(1), (pos_rois_rotation), + axis=2).squeeze(1) + + pred_boxes3d[:, 0:3] += roi_xyz + + # calculate corner loss + loss_corner = self.get_corner_loss_lidar(pred_boxes3d, + pos_gt_bboxes) + + losses['loss_corner'] = loss_corner + else: + losses['loss_corner'] = loss_cls.new_tensor(0) + + return losses + + def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): + """Calculate corner loss of given boxes. + + Args: + pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). + gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). + delta (float, optional): Huber loss threshold. Defaults to 1.0. + + Returns: + torch.FloatTensor: Calculated corner loss in shape (N). + """ + assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] + + # This is a bit hacky here because we assume the boxes for + # PointRCNN are in LiDAR coordinates + + gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) + pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners + gt_box_corners = gt_boxes_structure.corners + + # This flip only changes the heading direction of GT boxes + gt_bbox3d_flip = gt_boxes_structure.clone() + gt_bbox3d_flip.tensor[:, 6] += np.pi + gt_box_corners_flip = gt_bbox3d_flip.corners + + corner_dist = torch.min( + torch.norm(pred_box_corners - gt_box_corners, dim=2), + torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) + # huber loss + abs_error = corner_dist.abs() + quadratic = abs_error.clamp(max=delta) + linear = (abs_error - quadratic) + corner_loss = 0.5 * quadratic**2 + delta * linear + return corner_loss.mean(dim=1) + + def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): + """Generate targets.
+ + Args: + sampling_results (list[:obj:`SamplingResult`]): + Sampled results from rois. + rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. + concat (bool, optional): Whether to concatenate targets between + batches. Defaults to True. + + Returns: + tuple[torch.Tensor]: Targets of boxes and class prediction. + """ + pos_bboxes_list = [res.pos_bboxes for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + iou_list = [res.iou for res in sampling_results] + targets = multi_apply( + self._get_target_single, + pos_bboxes_list, + pos_gt_bboxes_list, + iou_list, + cfg=rcnn_train_cfg) + (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) = targets + + if concat: + label = torch.cat(label, 0) + bbox_targets = torch.cat(bbox_targets, 0) + pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) + reg_mask = torch.cat(reg_mask, 0) + + label_weights = torch.cat(label_weights, 0) + label_weights /= torch.clamp(label_weights.sum(), min=1.0) + + bbox_weights = torch.cat(bbox_weights, 0) + bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): + """Generate training targets for a single sample. + + Args: + pos_bboxes (torch.Tensor): Positive boxes with shape + (N, 7). + pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape + (M, 7). + ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` + in shape (N, M). + cfg (dict): Training configs. + + Returns: + tuple[torch.Tensor]: Target for positive boxes. + (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + """ + cls_pos_mask = ious > cfg.cls_pos_thr + cls_neg_mask = ious < cfg.cls_neg_thr + interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) + # iou regression target + label = (cls_pos_mask > 0).float() + label[interval_mask] = (ious[interval_mask] - cfg.cls_neg_thr) / \ + (cfg.cls_pos_thr - cfg.cls_neg_thr) + # label weights + label_weights = (label >= 0).float() + # box regression target + reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() + reg_mask[0:pos_gt_bboxes.size(0)] = 1 + bbox_weights = (reg_mask > 0).float() + if reg_mask.bool().any(): + pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() + roi_center = pos_bboxes[..., 0:3] + roi_ry = pos_bboxes[..., 6] % (2 * np.pi) + + # canonical transformation + pos_gt_bboxes_ct[..., 0:3] -= roi_center + pos_gt_bboxes_ct[..., 6] -= roi_ry + pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( + pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -(roi_ry), + axis=2).squeeze(1) + + # flip orientation if gt have opposite orientation + ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi + is_opposite = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) + ry_label[is_opposite] = (ry_label[is_opposite] + np.pi) % ( + 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) + flag = ry_label > np.pi + ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) + ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) + pos_gt_bboxes_ct[..., 6] = ry_label + + rois_anchor = pos_bboxes.clone().detach() + rois_anchor[:, 0:3] = 0 + rois_anchor[:, 6] = 0 + bbox_targets = self.bbox_coder.encode(rois_anchor, + pos_gt_bboxes_ct) + else: + # no fg bbox + bbox_targets = pos_gt_bboxes.new_empty((0, 7)) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def get_bboxes(self, + rois, + cls_score, + bbox_pred, + class_labels, 
+ img_metas, + cfg=None): + """Generate bboxes from bbox head predictions. + + Args: + rois (torch.Tensor): RoI bounding boxes. + cls_score (torch.Tensor): Scores of bounding boxes. + bbox_pred (torch.Tensor): Bounding box predictions. + class_labels (torch.Tensor): Labels of classes. + img_metas (list[dict]): Point cloud and image's meta info. + cfg (:obj:`ConfigDict`, optional): Testing config. + Defaults to None. + + Returns: + list[tuple]: Decoded bbox, scores and labels after nms. + """ + roi_batch_id = rois[..., 0] + roi_boxes = rois[..., 1:] # boxes without batch id + batch_size = int(roi_batch_id.max().item() + 1) + + # decode boxes + roi_ry = roi_boxes[..., 6].view(-1) + roi_xyz = roi_boxes[..., 0:3].view(-1, 3) + local_roi_boxes = roi_boxes.clone().detach() + local_roi_boxes[..., 0:3] = 0 + rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) + rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( + rcnn_boxes3d[..., 0:3].unsqueeze(1), (roi_ry), axis=2).squeeze(1) + rcnn_boxes3d[:, 0:3] += roi_xyz + + # post processing + result_list = [] + for batch_id in range(batch_size): + cur_class_labels = class_labels[batch_id] + cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) + + cur_box_prob = cls_score[batch_id] + cur_box_prob = cur_cls_score.unsqueeze(1) + cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] + keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, + cfg.score_thr, cfg.nms_thr, + img_metas[batch_id], + cfg.use_rotate_nms) + selected_bboxes = cur_rcnn_boxes3d[keep] + selected_label_preds = cur_class_labels[keep] + selected_scores = cur_cls_score[keep] + + result_list.append( + (img_metas[batch_id]['box_type_3d'](selected_bboxes, + self.bbox_coder.code_size), + selected_scores, selected_label_preds)) + return result_list + + def multi_class_nms(self, + box_probs, + box_preds, + score_thr, + nms_thr, + input_meta, + use_rotate_nms=True): + """Multi-class NMS for box head. + + Note: + This function has large overlap with the `box3d_multiclass_nms` + implemented in `mmdet3d.core.post_processing`. We are considering + merging these two functions in the future. + + Args: + box_probs (torch.Tensor): Predicted box probabilities in + shape (N, num_classes). + box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). + score_thr (float): Threshold of scores. + nms_thr (float): Threshold for NMS. + input_meta (dict): Meta information of the current sample. + use_rotate_nms (bool, optional): Whether to use rotated nms. + Defaults to True. + + Returns: + torch.Tensor: Selected indices.
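+ + Note that ``score_thr`` and ``nms_thr`` may be given either as scalars + or as per-class lists; scalar thresholds are broadcast to every class + before NMS is applied.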
+ """ + if use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + + assert box_probs.shape[ + 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' + selected_list = [] + selected_labels = [] + boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + box_preds, self.bbox_coder.code_size).bev) + + score_thresh = score_thr if isinstance( + score_thr, list) else [score_thr for x in range(self.num_classes)] + nms_thresh = nms_thr if isinstance( + nms_thr, list) else [nms_thr for x in range(self.num_classes)] + for k in range(0, self.num_classes): + class_scores_keep = box_probs[:, k] >= score_thresh[k] + + if class_scores_keep.int().sum() > 0: + original_idxs = class_scores_keep.nonzero( + as_tuple=False).view(-1) + cur_boxes_for_nms = boxes_for_nms[class_scores_keep] + cur_rank_scores = box_probs[class_scores_keep, k] + + cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, + nms_thresh[k]) + + if cur_selected.shape[0] == 0: + continue + selected_list.append(original_idxs[cur_selected]) + selected_labels.append( + torch.full([cur_selected.shape[0]], + k + 1, + dtype=torch.int64, + device=box_preds.device)) + + keep = torch.cat( + selected_list, dim=0) if len(selected_list) > 0 else [] + return keep diff --git a/mmdet3d/models/roi_heads/h3d_roi_head.py b/mmdet3d/models/roi_heads/h3d_roi_head.py index ba5ef1e7b0..4bf8cf36b6 100644 --- a/mmdet3d/models/roi_heads/h3d_roi_head.py +++ b/mmdet3d/models/roi_heads/h3d_roi_head.py @@ -65,15 +65,15 @@ def forward_train(self, feats_dict (dict): Contains features from the first stage. img_metas (list[dict]): Contain pcd and img's meta info. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. - gt_bboxes_ignore (None | list[torch.Tensor]): Specify - which bounding. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding boxes to ignore. Returns: dict: losses from each head. diff --git a/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py b/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py index e3b4e453e0..fbdd2f0da3 100644 --- a/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py +++ b/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py @@ -83,15 +83,15 @@ def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d): sample. Args: - voxel_centers (torch.Tensor): The center of voxels in shape \ + voxel_centers (torch.Tensor): The center of voxels in shape (voxel_num, 3). - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in shape (box_num, 7). - gt_labels_3d (torch.Tensor): Class labels of ground truths in \ + gt_labels_3d (torch.Tensor): Class labels of ground truths in shape (box_num). 
Returns: - tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] \ + tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] part prediction targets with shape [voxel_num, 3] """ gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device) @@ -99,8 +99,8 @@ def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d): part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3), dtype=torch.float32) - box_idx = gt_bboxes_3d.points_in_boxes(voxel_centers) - enlarge_box_idx = enlarged_gt_boxes.points_in_boxes( + box_idx = gt_bboxes_3d.points_in_boxes_part(voxel_centers) + enlarge_box_idx = enlarged_gt_boxes.points_in_boxes_part( voxel_centers).long() gt_labels_pad = F.pad( @@ -131,19 +131,19 @@ def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d): """generate segmentation and part prediction targets. Args: - voxel_centers (torch.Tensor): The center of voxels in shape \ + voxel_centers (torch.Tensor): The center of voxels in shape (voxel_num, 3). - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in shape (box_num, 7). - gt_labels_3d (torch.Tensor): Class labels of ground truths in \ + gt_labels_3d (torch.Tensor): Class labels of ground truths in shape (box_num). Returns: dict: Prediction targets - - seg_targets (torch.Tensor): Segmentation targets \ + - seg_targets (torch.Tensor): Segmentation targets with shape [voxel_num]. - - part_targets (torch.Tensor): Part prediction targets \ + - part_targets (torch.Tensor): Part prediction targets with shape [voxel_num, 3]. """ batch_size = len(gt_labels_3d) diff --git a/mmdet3d/models/roi_heads/mask_heads/primitive_head.py b/mmdet3d/models/roi_heads/mask_heads/primitive_head.py index a62a1b55a5..6197fd52c5 100644 --- a/mmdet3d/models/roi_heads/mask_heads/primitive_head.py +++ b/mmdet3d/models/roi_heads/mask_heads/primitive_head.py @@ -20,7 +20,7 @@ class PrimitiveHead(BaseModule): num_dims (int): The dimension of primitive semantic information. num_classes (int): The number of class. primitive_mode (str): The mode of primitive module, - avaliable mode ['z', 'xy', 'line']. + available mode ['z', 'xy', 'line']. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. @@ -30,7 +30,7 @@ class PrimitiveHead(BaseModule): feat_channels (tuple[int]): Convolution channels of prediction layer. upper_thresh (float): Threshold for line matching. - surface_thresh (float): Threshold for suface matching. + surface_thresh (float): Threshold for surface matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. @@ -198,15 +198,15 @@ def loss(self, Args: bbox_preds (dict): Predictions from forward of primitive head. points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. - pts_instance_mask (None | list[torch.Tensor]): Point-wise + pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. 
- gt_bboxes_ignore (None | list[torch.Tensor]): Specify + gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: @@ -266,12 +266,12 @@ def get_targets(self, Args: points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. - pts_instance_mask (None | list[torch.Tensor]): Point-wise instance + pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (dict): Predictions from forward of primitive head. @@ -333,12 +333,12 @@ def get_targets_single(self, Args: points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (None | torch.Tensor): Point-wise semantic + pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. - pts_instance_mask (None | torch.Tensor): Point-wise instance + pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. Returns: @@ -355,7 +355,7 @@ def get_targets_single(self, # Generate pts_semantic_mask and pts_instance_mask when they are None if pts_semantic_mask is None or pts_instance_mask is None: - points2box_mask = gt_bboxes_3d.points_in_boxes(points) + points2box_mask = gt_bboxes_3d.points_in_boxes_all(points) assignment = points2box_mask.argmax(1) background_mask = points2box_mask.max(1)[0] == 0 diff --git a/mmdet3d/models/roi_heads/part_aggregation_roi_head.py b/mmdet3d/models/roi_heads/part_aggregation_roi_head.py index 7b6e2f656d..2e15fd3668 100644 --- a/mmdet3d/models/roi_heads/part_aggregation_roi_head.py +++ b/mmdet3d/models/roi_heads/part_aggregation_roi_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings + from torch.nn import functional as F from mmdet3d.core import AssignResult diff --git a/mmdet3d/models/roi_heads/point_rcnn_roi_head.py b/mmdet3d/models/roi_heads/point_rcnn_roi_head.py new file mode 100644 index 0000000000..14f5271633 --- /dev/null +++ b/mmdet3d/models/roi_heads/point_rcnn_roi_head.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + +from mmdet3d.core import AssignResult +from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi +from mmdet.core import build_assigner, build_sampler +from mmdet.models import HEADS +from ..builder import build_head, build_roi_extractor +from .base_3droi_head import Base3DRoIHead + + +@HEADS.register_module() +class PointRCNNRoIHead(Base3DRoIHead): + """RoI head for PointRCNN. + + Args: + bbox_head (dict): Config of bbox_head. + point_roi_extractor (dict): Config of RoI extractor. + train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + depth_normalizer (float, optional): Normalize depth feature. + Defaults to 70.0. + init_cfg (dict, optional): Config of initialization. Defaults to None. 
+ """ + + def __init__(self, + bbox_head, + point_roi_extractor, + train_cfg, + test_cfg, + depth_normalizer=70.0, + pretrained=None, + init_cfg=None): + super(PointRCNNRoIHead, self).__init__( + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + self.depth_normalizer = depth_normalizer + + if point_roi_extractor is not None: + self.point_roi_extractor = build_roi_extractor(point_roi_extractor) + + self.init_assigner_sampler() + + def init_bbox_head(self, bbox_head): + """Initialize box head. + + Args: + bbox_head (dict): Config dict of RoI Head. + """ + self.bbox_head = build_head(bbox_head) + + def init_mask_head(self): + """Initialize maek head.""" + pass + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + if isinstance(self.train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + elif isinstance(self.train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in self.train_cfg.assigner + ] + self.bbox_sampler = build_sampler(self.train_cfg.sampler) + + def forward_train(self, feats_dict, input_metas, proposal_list, + gt_bboxes_3d, gt_labels_3d): + """Training forward function of PointRCNNRoIHead. + + Args: + feats_dict (dict): Contains features from the first stage. + imput_metas (list[dict]): Meta info of each input. + proposal_list (list[dict]): Proposal information from rpn. + The dictionary should contain the following keys: + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes + - labels_3d (torch.Tensor): Labels of proposals + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): + GT bboxes of each sample. The bboxes are encapsulated + by 3D box structures. + gt_labels_3d (list[LongTensor]): GT labels of each sample. + + Returns: + dict: Losses from RoI RCNN head. + - loss_bbox (torch.Tensor): Loss of bboxes + """ + features = feats_dict['features'] + points = feats_dict['points'] + point_cls_preds = feats_dict['points_cls_preds'] + sem_scores = point_cls_preds.sigmoid() + point_scores = sem_scores.max(-1)[0] + + sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, + gt_labels_3d) + + # concat the depth, semantic features and backbone features + features = features.transpose(1, 2).contiguous() + point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 + features_list = [ + point_scores.unsqueeze(2), + point_depths.unsqueeze(2), features + ] + features = torch.cat(features_list, dim=2) + + bbox_results = self._bbox_forward_train(features, points, + sample_results) + losses = dict() + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, feats_dict, img_metas, proposal_list, **kwargs): + """Simple testing forward function of PointRCNNRoIHead. + + Note: + This function assumes that the batch size is 1 + + Args: + feats_dict (dict): Contains features from the first stage. + img_metas (list[dict]): Meta info of each image. + proposal_list (list[dict]): Proposal information from rpn. + + Returns: + dict: Bbox results of one frame. 
+ """ + rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) + labels_3d = [res['labels_3d'] for res in proposal_list] + + features = feats_dict['features'] + points = feats_dict['points'] + point_cls_preds = feats_dict['points_cls_preds'] + sem_scores = point_cls_preds.sigmoid() + point_scores = sem_scores.max(-1)[0] + + features = features.transpose(1, 2).contiguous() + point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 + features_list = [ + point_scores.unsqueeze(2), + point_depths.unsqueeze(2), features + ] + + features = torch.cat(features_list, dim=2) + batch_size = features.shape[0] + bbox_results = self._bbox_forward(features, points, batch_size, rois) + object_score = bbox_results['cls_score'].sigmoid() + bbox_list = self.bbox_head.get_bboxes( + rois, + object_score, + bbox_results['bbox_pred'], + labels_3d, + img_metas, + cfg=self.test_cfg) + + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def _bbox_forward_train(self, features, points, sampling_results): + """Forward training function of roi_extractor and bbox_head. + + Args: + features (torch.Tensor): Backbone features with depth and \ + semantic features. + points (torch.Tensor): Pointcloud. + sampling_results (:obj:`SamplingResult`): Sampled results used + for training. + + Returns: + dict: Forward results including losses and predictions. + """ + rois = bbox3d2roi([res.bboxes for res in sampling_results]) + batch_size = features.shape[0] + bbox_results = self._bbox_forward(features, points, batch_size, rois) + bbox_targets = self.bbox_head.get_targets(sampling_results, + self.train_cfg) + + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def _bbox_forward(self, features, points, batch_size, rois): + """Forward function of roi_extractor and bbox_head used in both + training and testing. + + Args: + features (torch.Tensor): Backbone features with depth and + semantic features. + points (torch.Tensor): Pointcloud. + batch_size (int): Batch size. + rois (torch.Tensor): RoI boxes. + + Returns: + dict: Contains predictions of bbox_head and + features of roi_extractor. + """ + pooled_point_feats = self.point_roi_extractor(features, points, + batch_size, rois) + + cls_score, bbox_pred = self.bbox_head(pooled_point_feats) + bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) + return bbox_results + + def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): + """Assign and sample proposals for training. + + Args: + proposal_list (list[dict]): Proposals produced by RPN. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes. + gt_labels_3d (list[torch.Tensor]): Ground truth labels + + Returns: + list[:obj:`SamplingResult`]: Sampled results of each training + sample. 
+ """ + sampling_results = [] + # bbox assign + for batch_idx in range(len(proposal_list)): + cur_proposal_list = proposal_list[batch_idx] + cur_boxes = cur_proposal_list['boxes_3d'] + cur_labels_3d = cur_proposal_list['labels_3d'] + cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) + cur_gt_labels = gt_labels_3d[batch_idx] + batch_num_gts = 0 + # 0 is bg + batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) + batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) + # -1 is bg + batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) + + # each class may have its own assigner + if isinstance(self.bbox_assigner, list): + for i, assigner in enumerate(self.bbox_assigner): + gt_per_cls = (cur_gt_labels == i) + pred_per_cls = (cur_labels_3d == i) + cur_assign_res = assigner.assign( + cur_boxes.tensor[pred_per_cls], + cur_gt_bboxes.tensor[gt_per_cls], + gt_labels=cur_gt_labels[gt_per_cls]) + # gather assign_results in different class into one result + batch_num_gts += cur_assign_res.num_gts + # gt inds (1-based) + gt_inds_arange_pad = gt_per_cls.nonzero( + as_tuple=False).view(-1) + 1 + # pad 0 for indice unassigned + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=0) + # pad -1 for indice ignore + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=-1) + # convert to 0~gt_num+2 for indices + gt_inds_arange_pad += 1 + # now 0 is bg, >1 is fg in batch_gt_indis + batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ + cur_assign_res.gt_inds + 1] - 1 + batch_max_overlaps[ + pred_per_cls] = cur_assign_res.max_overlaps + batch_gt_labels[pred_per_cls] = cur_assign_res.labels + + assign_result = AssignResult(batch_num_gts, batch_gt_indis, + batch_max_overlaps, + batch_gt_labels) + else: # for single class + assign_result = self.bbox_assigner.assign( + cur_boxes.tensor, + cur_gt_bboxes.tensor, + gt_labels=cur_gt_labels) + + # sample boxes + sampling_result = self.bbox_sampler.sample(assign_result, + cur_boxes.tensor, + cur_gt_bboxes.tensor, + cur_gt_labels) + sampling_results.append(sampling_result) + return sampling_results diff --git a/mmdet3d/models/roi_heads/roi_extractors/__init__.py b/mmdet3d/models/roi_heads/roi_extractors/__init__.py index da0be4f7d2..70c28812bb 100644 --- a/mmdet3d/models/roi_heads/roi_extractors/__init__.py +++ b/mmdet3d/models/roi_heads/roi_extractors/__init__.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor from .single_roiaware_extractor import Single3DRoIAwareExtractor +from .single_roipoint_extractor import Single3DRoIPointExtractor -__all__ = ['SingleRoIExtractor', 'Single3DRoIAwareExtractor'] +__all__ = [ + 'SingleRoIExtractor', 'Single3DRoIAwareExtractor', + 'Single3DRoIPointExtractor' +] diff --git a/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py b/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py new file mode 100644 index 0000000000..b8c287ed64 --- /dev/null +++ b/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn + +from mmdet3d import ops +from mmdet3d.core.bbox.structures import rotation_3d_in_axis +from mmdet.models.builder import ROI_EXTRACTORS + + +@ROI_EXTRACTORS.register_module() +class Single3DRoIPointExtractor(nn.Module): + """Point-wise roi-aware Extractor. + + Extract Point-wise roi features. 
+ + Args: + roi_layer (dict): The config of roi layer. + """ + + def __init__(self, roi_layer=None): + super(Single3DRoIPointExtractor, self).__init__() + self.roi_layer = self.build_roi_layers(roi_layer) + + def build_roi_layers(self, layer_cfg): + """Build roi layers using `layer_cfg`""" + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + roi_layers = layer_cls(**cfg) + return roi_layers + + def forward(self, feats, coordinate, batch_inds, rois): + """Extract point-wise roi features. + + Args: + feats (torch.FloatTensor): Point-wise features with + shape (batch, npoints, channels) for pooling. + coordinate (torch.FloatTensor): Coordinate of each point. + batch_inds (torch.LongTensor): Indicate the batch of each point. + rois (torch.FloatTensor): Roi boxes with batch indices. + + Returns: + torch.FloatTensor: Pooled features + """ + rois = rois[..., 1:] + rois = rois.view(batch_inds, -1, rois.shape[-1]) + with torch.no_grad(): + pooled_roi_feat, pooled_empty_flag = self.roi_layer( + coordinate, feats, rois) + + # canonical transformation + roi_center = rois[:, :, 0:3] + pooled_roi_feat[:, :, :, 0:3] -= roi_center.unsqueeze(dim=2) + pooled_roi_feat = pooled_roi_feat.view(-1, + pooled_roi_feat.shape[-2], + pooled_roi_feat.shape[-1]) + pooled_roi_feat[:, :, 0:3] = rotation_3d_in_axis( + pooled_roi_feat[:, :, 0:3], + -(rois.view(-1, rois.shape[-1])[:, 6]), + axis=2) + pooled_roi_feat[pooled_empty_flag.view(-1) > 0] = 0 + + return pooled_roi_feat diff --git a/mmdet3d/models/segmentors/base.py b/mmdet3d/models/segmentors/base.py index 3cb83acce6..9913698337 100644 --- a/mmdet3d/models/segmentors/base.py +++ b/mmdet3d/models/segmentors/base.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + import mmcv import numpy as np import torch from mmcv.parallel import DataContainer as DC from mmcv.runner import auto_fp16 -from os import path as osp from mmdet3d.core import show_seg_result from mmseg.models.segmentors import BaseSegmentor @@ -80,7 +81,7 @@ def show_results(self, Args: data (list[dict]): Input points and the information of the sample. result (list[dict]): Prediction results. - palette (list[list[int]]] | np.ndarray | None): The palette of + palette (list[list[int]]] | np.ndarray): The palette of segmentation map. If None is given, random palette will be generated. Default: None out_dir (str): Output directory of visualization result. diff --git a/mmdet3d/models/segmentors/encoder_decoder.py b/mmdet3d/models/segmentors/encoder_decoder.py index e64dfc3930..196904ad1e 100644 --- a/mmdet3d/models/segmentors/encoder_decoder.py +++ b/mmdet3d/models/segmentors/encoder_decoder.py @@ -187,7 +187,7 @@ def _input_generation(coords, use_normalized_coord=False): """Generating model input. - Generate input by subtracting patch center and adding additional \ + Generate input by subtracting patch center and adding additional features. Currently support colors and normalized xyz as features. Args: @@ -195,7 +195,7 @@ def _input_generation(coords, patch_center (torch.Tensor): Center coordinate of the patch. coord_max (torch.Tensor): Max coordinate of all 3D points. feats (torch.Tensor): Features of sampled points of shape [S, C]. - use_normalized_coord (bool, optional): Whether to use normalized \ + use_normalized_coord (bool, optional): Whether to use normalized xyz as additional features. Defaults to False. 
Returns: @@ -233,17 +233,17 @@ def _sliding_patch_generation(self, block_size (float, optional): Size of a patch to sample. sample_rate (float, optional): Stride used in sliding patch. Defaults to 0.5. - use_normalized_coord (bool, optional): Whether to use normalized \ + use_normalized_coord (bool, optional): Whether to use normalized xyz as additional features. Defaults to False. eps (float, optional): A value added to patch boundary to guarantee - points coverage. Default 1e-3. + points coverage. Defaults to 1e-3. Returns: np.ndarray | np.ndarray: - - patch_points (torch.Tensor): Points of different patches of \ + - patch_points (torch.Tensor): Points of different patches of shape [K, N, 3+C]. - - patch_idxs (torch.Tensor): Index of each point in \ + - patch_idxs (torch.Tensor): Index of each point in `patch_points`, of shape [K, N]. """ device = points.device diff --git a/mmdet3d/models/utils/__init__.py b/mmdet3d/models/utils/__init__.py index d0639301d0..92a0499a8d 100644 --- a/mmdet3d/models/utils/__init__.py +++ b/mmdet3d/models/utils/__init__.py @@ -1,5 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .clip_sigmoid import clip_sigmoid +from .edge_indices import get_edge_indices +from .gen_keypoints import get_keypoints +from .handle_objs import filter_outside_objs, handle_proj_objs from .mlp import MLP -__all__ = ['clip_sigmoid', 'MLP'] +__all__ = [ + 'clip_sigmoid', 'MLP', 'get_edge_indices', 'filter_outside_objs', + 'handle_proj_objs', 'get_keypoints' +] diff --git a/mmdet3d/models/utils/clip_sigmoid.py b/mmdet3d/models/utils/clip_sigmoid.py index 3936d7de29..3afd4edbef 100644 --- a/mmdet3d/models/utils/clip_sigmoid.py +++ b/mmdet3d/models/utils/clip_sigmoid.py @@ -7,8 +7,8 @@ def clip_sigmoid(x, eps=1e-4): Args: x (torch.Tensor): Input feature map with the shape of [B, N, H, W]. - eps (float): Lower bound of the range to be clamped to. Defaults - to 1e-4. + eps (float, optional): Lower bound of the range to be clamped to. + Defaults to 1e-4. Returns: torch.Tensor: Feature map after sigmoid. diff --git a/mmdet3d/models/utils/edge_indices.py b/mmdet3d/models/utils/edge_indices.py new file mode 100644 index 0000000000..5dcb71feab --- /dev/null +++ b/mmdet3d/models/utils/edge_indices.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def get_edge_indices(img_metas, + downsample_ratio, + step=1, + pad_mode='default', + dtype=np.float32, + device='cpu'): + """Function to generate the edge indices of the image border on the + downsampled feature map. + The edge_indices are generated using numpy on cpu rather + than on CUDA due to the latency issue. When batch size = 8, + this function with numpy array is ~8 times faster than that + with CUDA tensor (0.09s and 0.72s in 100 runs). + + Args: + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + downsample_ratio (int): Downsample ratio of output feature map. + step (int, optional): Step size used for generating + edge indices. Default: 1. + pad_mode (str, optional): Padding mode during data pipeline. + Default: 'default'. + dtype (np.dtype, optional): Dtype of edge indices array. + Default: np.float32. + device (str, optional): Device of edge indices tensor. + Default: 'cpu'. + + Returns: + list[Tensor]: Edge indices for each image in batch data.
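+ + The returned indices trace the border of the valid image region on the + downsampled feature map: left edge (top to bottom), bottom edge (left + to right), right edge (bottom to top) and top edge (right to left).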
+ """ + edge_indices_list = [] + for i in range(len(img_metas)): + img_shape = img_metas[i]['img_shape'] + pad_shape = img_metas[i]['pad_shape'] + h, w = img_shape[:2] + pad_h, pad_w = pad_shape + edge_indices = [] + + if pad_mode == 'default': + x_min = 0 + y_min = 0 + x_max = (w - 1) // downsample_ratio + y_max = (h - 1) // downsample_ratio + elif pad_mode == 'center': + x_min = np.ceil((pad_w - w) / 2 * downsample_ratio) + y_min = np.ceil((pad_h - h) / 2 * downsample_ratio) + x_max = x_min + w // downsample_ratio + y_max = y_min + h // downsample_ratio + else: + raise NotImplementedError + + # left + y = np.arange(y_min, y_max, step, dtype=dtype) + x = np.ones(len(y)) * x_min + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # bottom + x = np.arange(x_min, x_max, step, dtype=dtype) + y = np.ones(len(x)) * y_max + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # right + y = np.arange(y_max, y_min, -step, dtype=dtype) + x = np.ones(len(y)) * x_max + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # top + x = np.arange(x_max, x_min, -step, dtype=dtype) + y = np.ones(len(x)) * y_min + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + edge_indices = \ + np.concatenate([index for index in edge_indices], axis=0) + edge_indices = torch.from_numpy(edge_indices).to(device).long() + edge_indices_list.append(edge_indices) + + return edge_indices_list diff --git a/mmdet3d/models/utils/gen_keypoints.py b/mmdet3d/models/utils/gen_keypoints.py new file mode 100644 index 0000000000..8c7909b89a --- /dev/null +++ b/mmdet3d/models/utils/gen_keypoints.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core.bbox import points_cam2img + + +def get_keypoints(gt_bboxes_3d_list, + centers2d_list, + img_metas, + use_local_coords=True): + """Function to filter the objects label outside the image. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + use_local_coords (bool, optional): Wheher to use local coordinates + for keypoints. Default: True. + + Returns: + tuple[list[Tensor]]: It contains two elements, the first is the + keypoints for each projected 2D bbox in batch data. The second is + the visible mask of depth calculated by keypoints. 
+ """ + + assert len(gt_bboxes_3d_list) == len(centers2d_list) + bs = len(gt_bboxes_3d_list) + keypoints2d_list = [] + keypoints_depth_mask_list = [] + + for i in range(bs): + gt_bboxes_3d = gt_bboxes_3d_list[i] + centers2d = centers2d_list[i] + img_shape = img_metas[i]['img_shape'] + cam2img = img_metas[i]['cam2img'] + h, w = img_shape[:2] + # (N, 8, 3) + corners3d = gt_bboxes_3d.corners + top_centers3d = torch.mean(corners3d[:, [0, 1, 4, 5], :], dim=1) + bot_centers3d = torch.mean(corners3d[:, [2, 3, 6, 7], :], dim=1) + # (N, 2, 3) + top_bot_centers3d = torch.stack((top_centers3d, bot_centers3d), dim=1) + keypoints3d = torch.cat((corners3d, top_bot_centers3d), dim=1) + # (N, 10, 2) + keypoints2d = points_cam2img(keypoints3d, cam2img) + + # keypoints mask: keypoints must be inside + # the image and in front of the camera + keypoints_x_visible = (keypoints2d[..., 0] >= 0) & ( + keypoints2d[..., 0] <= w - 1) + keypoints_y_visible = (keypoints2d[..., 1] >= 0) & ( + keypoints2d[..., 1] <= h - 1) + keypoints_z_visible = (keypoints3d[..., -1] > 0) + + # (N, 1O) + keypoints_visible = keypoints_x_visible & \ + keypoints_y_visible & keypoints_z_visible + # center, diag-02, diag-13 + keypoints_depth_valid = torch.stack( + (keypoints_visible[:, [8, 9]].all(dim=1), + keypoints_visible[:, [0, 3, 5, 6]].all(dim=1), + keypoints_visible[:, [1, 2, 4, 7]].all(dim=1)), + dim=1) + keypoints_visible = keypoints_visible.float() + + if use_local_coords: + keypoints2d = torch.cat((keypoints2d - centers2d.unsqueeze(1), + keypoints_visible.unsqueeze(-1)), + dim=2) + else: + keypoints2d = torch.cat( + (keypoints2d, keypoints_visible.unsqueeze(-1)), dim=2) + + keypoints2d_list.append(keypoints2d) + keypoints_depth_mask_list.append(keypoints_depth_valid) + + return (keypoints2d_list, keypoints_depth_mask_list) diff --git a/mmdet3d/models/utils/handle_objs.py b/mmdet3d/models/utils/handle_objs.py new file mode 100644 index 0000000000..25fd793a3a --- /dev/null +++ b/mmdet3d/models/utils/handle_objs.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, + gt_labels_3d_list, centers2d_list, img_metas): + """Function to filter the objects label outside the image. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + """ + bs = len(centers2d_list) + + for i in range(bs): + centers2d = centers2d_list[i].clone() + img_shape = img_metas[i]['img_shape'] + keep_inds = (centers2d[:, 0] > 0) & \ + (centers2d[:, 0] < img_shape[1]) & \ + (centers2d[:, 1] > 0) & \ + (centers2d[:, 1] < img_shape[0]) + centers2d_list[i] = centers2d[keep_inds] + gt_labels_list[i] = gt_labels_list[i][keep_inds] + gt_bboxes_list[i] = gt_bboxes_list[i][keep_inds] + gt_bboxes_3d_list[i].tensor = gt_bboxes_3d_list[i].tensor[keep_inds] + gt_labels_3d_list[i] = gt_labels_3d_list[i][keep_inds] + + +def get_centers2d_target(centers2d, centers, img_shape): + """Function to get target centers2d. 
+ + Args: + centers2d (Tensor): Projected 3D centers onto 2D images. + centers (Tensor): Centers of 2d gt bboxes. + img_shape (tuple): Resized image shape. + + Returns: + torch.Tensor: Projected 3D centers (centers2D) target. + """ + N = centers2d.shape[0] + h, w = img_shape[:2] + valid_intersects = centers2d.new_zeros((N, 2)) + a = (centers[:, 1] - centers2d[:, 1]) / (centers[:, 0] - centers2d[:, 0]) + b = centers[:, 1] - a * centers[:, 0] + left_y = b + right_y = (w - 1) * a + b + top_x = -b / a + bottom_x = (h - 1 - b) / a + + left_coors = torch.stack((left_y.new_zeros(N, ), left_y), dim=1) + right_coors = torch.stack((right_y.new_full((N, ), w - 1), right_y), dim=1) + top_coors = torch.stack((top_x, top_x.new_zeros(N, )), dim=1) + bottom_coors = torch.stack((bottom_x, bottom_x.new_full((N, ), h - 1)), + dim=1) + + intersects = torch.stack( + [left_coors, right_coors, top_coors, bottom_coors], dim=1) + intersects_x = intersects[:, :, 0] + intersects_y = intersects[:, :, 1] + inds = (intersects_x >= 0) & (intersects_x <= + w - 1) & (intersects_y >= 0) & ( + intersects_y <= h - 1) + valid_intersects = intersects[inds].reshape(N, 2, 2) + dist = torch.norm(valid_intersects - centers2d.unsqueeze(1), dim=2) + min_idx = torch.argmin(dist, dim=1) + + min_idx = min_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2) + centers2d_target = valid_intersects.gather(dim=1, index=min_idx).squeeze(1) + + return centers2d_target + + +def handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas): + """Function to handle projected object centers2d and generate target + centers2d. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[list[Tensor]]: It contains three elements. The first is the + target centers2d after handling the truncated objects. The second + is the offsets between target centers2d and the rounded + centers2d, and the last is the truncation mask for each object in + batch data. + """ + bs = len(centers2d_list) + centers2d_target_list = [] + trunc_mask_list = [] + offsets2d_list = [] + # For now, only the pad mode where the image is padded on the right + # and bottom sides is supported.
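+ # For each image: centers2d that fall inside the image are kept as-is; + # for truncated objects, the target center is moved to the intersection + # of the image border with the line joining the 2D box center and the + # projected 3D center (see get_centers2d_target above).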
+ for i in range(bs): + centers2d = centers2d_list[i] + gt_bbox = gt_bboxes_list[i] + img_shape = img_metas[i]['img_shape'] + centers2d_target = centers2d.clone() + inside_inds = (centers2d[:, 0] > 0) & \ + (centers2d[:, 0] < img_shape[1]) & \ + (centers2d[:, 1] > 0) & \ + (centers2d[:, 1] < img_shape[0]) + outside_inds = ~inside_inds + + # if there are outside objects + if outside_inds.any(): + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + outside_centers2d = centers2d[outside_inds] + match_centers = centers[outside_inds] + target_outside_centers2d = get_centers2d_target( + outside_centers2d, match_centers, img_shape) + centers2d_target[outside_inds] = target_outside_centers2d + + offsets2d = centers2d - centers2d_target.round().int() + trunc_mask = outside_inds + + centers2d_target_list.append(centers2d_target) + trunc_mask_list.append(trunc_mask) + offsets2d_list.append(offsets2d) + + return (centers2d_target_list, offsets2d_list, trunc_mask_list) diff --git a/mmdet3d/models/utils/mlp.py b/mmdet3d/models/utils/mlp.py index 55ea3885e4..0b499bb46f 100644 --- a/mmdet3d/models/utils/mlp.py +++ b/mmdet3d/models/utils/mlp.py @@ -10,15 +10,15 @@ class MLP(BaseModule): Pass features (B, C, N) through an MLP. Args: - in_channels (int): Number of channels of input features. + in_channels (int, optional): Number of channels of input features. Default: 18. - conv_channels (tuple[int]): Out channels of the convolution. + conv_channels (tuple[int], optional): Out channels of the convolution. Default: (256, 256). - conv_cfg (dict): Config of convolution. + conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). - norm_cfg (dict): Config of normalization. + norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). - act_cfg (dict): Config of activation. + act_cfg (dict, optional): Config of activation. Default: dict(type='ReLU'). """ diff --git a/mmdet3d/models/voxel_encoders/pillar_encoder.py b/mmdet3d/models/voxel_encoders/pillar_encoder.py index 80d9498660..c91cf282ad 100644 --- a/mmdet3d/models/voxel_encoders/pillar_encoder.py +++ b/mmdet3d/models/voxel_encoders/pillar_encoder.py @@ -15,7 +15,6 @@ class PillarFeatureNet(nn.Module): The network prepares the pillar features and performs forward pass through PFNLayers. - Args: in_channels (int, optional): Number of input features, either x, y, z or x, y, z, r. Defaults to 4. @@ -33,7 +32,7 @@ class PillarFeatureNet(nn.Module): Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). mode (str, optional): The mode to gather point features. Options are 'max' or 'avg'. Defaults to 'max'. - legacy (bool): Whether to use the new behavior or + legacy (bool, optional): Whether to use the new behavior or the original behavior. Defaults to True. """ @@ -54,7 +53,7 @@ def __init__(self, if with_cluster_center: in_channels += 3 if with_voxel_center: - in_channels += 2 + in_channels += 3 if with_distance: in_channels += 1 self._with_distance = with_distance @@ -84,8 +83,10 @@ def __init__(self, # Need pillar (voxel) size and x/y offset in order to calculate offset self.vx = voxel_size[0] self.vy = voxel_size[1] + self.vz = voxel_size[2] self.x_offset = self.vx / 2 + point_cloud_range[0] self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] self.point_cloud_range = point_cloud_range @force_fp32(out_fp16=True) @@ -97,7 +98,6 @@ def forward(self, features, num_points, coors): (N, M, C). num_points (torch.Tensor): Number of points in each pillar. 
coors (torch.Tensor): Coordinates of each voxel. - Returns: torch.Tensor: Features of pillars. """ @@ -114,21 +114,27 @@ def forward(self, features, num_points, coors): dtype = features.dtype if self._with_voxel_center: if not self.legacy: - f_center = torch.zeros_like(features[:, :, :2]) + f_center = torch.zeros_like(features[:, :, :3]) f_center[:, :, 0] = features[:, :, 0] - ( coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = features[:, :, 1] - ( coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - ( + coors[:, 1].to(dtype).unsqueeze(1) * self.vz + + self.z_offset) else: - f_center = features[:, :, :2] + f_center = features[:, :, :3] f_center[:, :, 0] = f_center[:, :, 0] - ( coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = f_center[:, :, 1] - ( coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset) + f_center[:, :, 2] = f_center[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) features_ls.append(f_center) if self._with_distance: @@ -177,6 +183,8 @@ class DynamicPillarFeatureNet(PillarFeatureNet): Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). mode (str, optional): The mode to gather point features. Options are 'max' or 'avg'. Defaults to 'max'. + legacy (bool, optional): Whether to use the new behavior or + the original behavior. Defaults to True. """ def __init__(self, @@ -188,7 +196,8 @@ def __init__(self, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - mode='max'): + mode='max', + legacy=True): super(DynamicPillarFeatureNet, self).__init__( in_channels, feat_channels, @@ -198,7 +207,8 @@ def __init__(self, voxel_size=voxel_size, point_cloud_range=point_cloud_range, norm_cfg=norm_cfg, - mode=mode) + mode=mode, + legacy=legacy) self.fp16_enabled = False feat_channels = [self.in_channels] + list(feat_channels) pfn_layers = [] @@ -233,7 +243,7 @@ def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): Returns: torch.Tensor: Corresponding voxel centers of each points, shape - (M, C), where M is the numver of points. + (M, C), where M is the number of points. """ # Step 1: scatter voxel into canvas # Calculate necessary things for canvas creation diff --git a/mmdet3d/models/voxel_encoders/utils.py b/mmdet3d/models/voxel_encoders/utils.py index 5055b06196..8c54fc2d16 100644 --- a/mmdet3d/models/voxel_encoders/utils.py +++ b/mmdet3d/models/voxel_encoders/utils.py @@ -113,11 +113,12 @@ class PFNLayer(nn.Module): Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. - norm_cfg (dict): Config dict of normalization layers - last_layer (bool): If last_layer, there is no concatenation of - features. - mode (str): Pooling model to gather features inside voxels. - Default to 'max'. + norm_cfg (dict, optional): Config dict of normalization layers. + Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). + last_layer (bool, optional): If last_layer, there is no + concatenation of features. Defaults to False. + mode (str, optional): Pooling model to gather features inside voxels. + Defaults to 'max'. 
""" def __init__(self, diff --git a/mmdet3d/models/voxel_encoders/voxel_encoder.py b/mmdet3d/models/voxel_encoders/voxel_encoder.py index fcc1283e09..33978ddc94 100644 --- a/mmdet3d/models/voxel_encoders/voxel_encoder.py +++ b/mmdet3d/models/voxel_encoders/voxel_encoder.py @@ -17,7 +17,7 @@ class HardSimpleVFE(nn.Module): It simply averages the values of points in a voxel. Args: - num_features (int): Number of features to use. Default: 4. + num_features (int, optional): Number of features to use. Default: 4. """ def __init__(self, num_features=4): @@ -93,25 +93,27 @@ class DynamicVFE(nn.Module): The number of points inside the voxel varies. Args: - in_channels (int): Input channels of VFE. Defaults to 4. - feat_channels (list(int)): Channels of features in VFE. - with_distance (bool): Whether to use the L2 distance of points to the - origin point. Default False. - with_cluster_center (bool): Whether to use the distance to cluster - center of points inside a voxel. Default to False. - with_voxel_center (bool): Whether to use the distance to center of - voxel for each points inside a voxel. Default to False. - voxel_size (tuple[float]): Size of a single voxel. Default to - (0.2, 0.2, 4). - point_cloud_range (tuple[float]): The range of points or voxels. - Default to (0, -40, -3, 70.4, 40, 1). - norm_cfg (dict): Config dict of normalization layers. - mode (str): The mode when pooling features of points inside a voxel. - Available options include 'max' and 'avg'. Default to 'max'. - fusion_layer (dict | None): The config dict of fusion layer used in - multi-modal detectors. Default to None. - return_point_feats (bool): Whether to return the features of each - points. Default to False. + in_channels (int, optional): Input channels of VFE. Defaults to 4. + feat_channels (list(int), optional): Channels of features in VFE. + with_distance (bool, optional): Whether to use the L2 distance of + points to the origin point. Defaults to False. + with_cluster_center (bool, optional): Whether to use the distance + to cluster center of points inside a voxel. Defaults to False. + with_voxel_center (bool, optional): Whether to use the distance + to center of voxel for each points inside a voxel. + Defaults to False. + voxel_size (tuple[float], optional): Size of a single voxel. + Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): The range of points + or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg (dict, optional): Config dict of normalization layers. + mode (str, optional): The mode when pooling features of points + inside a voxel. Available options include 'max' and 'avg'. + Defaults to 'max'. + fusion_layer (dict, optional): The config dict of fusion + layer used in multi-modal detectors. Defaults to None. + return_point_feats (bool, optional): Whether to return the features + of each points. Defaults to False. """ def __init__(self, @@ -230,7 +232,7 @@ def forward(self, coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim). points (list[torch.Tensor], optional): Raw points used to guide the multi-modality fusion. Defaults to None. - img_feats (list[torch.Tensor], optional): Image fetures used for + img_feats (list[torch.Tensor], optional): Image features used for multi-modality fusion. Defaults to None. img_metas (dict, optional): [description]. Defaults to None. @@ -292,25 +294,26 @@ class HardVFE(nn.Module): image feature into voxel features in a point-wise manner. Args: - in_channels (int): Input channels of VFE. Defaults to 4. 
- feat_channels (list(int)): Channels of features in VFE. - with_distance (bool): Whether to use the L2 distance of points to the - origin point. Default False. - with_cluster_center (bool): Whether to use the distance to cluster - center of points inside a voxel. Default to False. - with_voxel_center (bool): Whether to use the distance to center of - voxel for each points inside a voxel. Default to False. - voxel_size (tuple[float]): Size of a single voxel. Default to - (0.2, 0.2, 4). - point_cloud_range (tuple[float]): The range of points or voxels. - Default to (0, -40, -3, 70.4, 40, 1). - norm_cfg (dict): Config dict of normalization layers. - mode (str): The mode when pooling features of points inside a voxel. - Available options include 'max' and 'avg'. Default to 'max'. - fusion_layer (dict | None): The config dict of fusion layer used in - multi-modal detectors. Default to None. - return_point_feats (bool): Whether to return the features of each - points. Default to False. + in_channels (int, optional): Input channels of VFE. Defaults to 4. + feat_channels (list(int), optional): Channels of features in VFE. + with_distance (bool, optional): Whether to use the L2 distance + of points to the origin point. Defaults to False. + with_cluster_center (bool, optional): Whether to use the distance + to cluster center of points inside a voxel. Defaults to False. + with_voxel_center (bool, optional): Whether to use the distance to + center of voxel for each points inside a voxel. Defaults to False. + voxel_size (tuple[float], optional): Size of a single voxel. + Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): The range of points + or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg (dict, optional): Config dict of normalization layers. + mode (str, optional): The mode when pooling features of points inside a + voxel. Available options include 'max' and 'avg'. + Defaults to 'max'. + fusion_layer (dict, optional): The config dict of fusion layer + used in multi-modal detectors. Defaults to None. + return_point_feats (bool, optional): Whether to return the + features of each points. Defaults to False. """ def __init__(self, @@ -394,7 +397,7 @@ def forward(self, features (torch.Tensor): Features of voxels, shape is MxNxC. num_points (torch.Tensor): Number of points in each voxel. coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim). - img_feats (list[torch.Tensor], optional): Image fetures used for + img_feats (list[torch.Tensor], optional): Image features used for multi-modality fusion. Defaults to None. img_metas (dict, optional): [description]. Defaults to None. 
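The voxel-encoder hunks above extend the PillarFeatureNet voxel-center feature from a 2D (x, y) offset to a 3D (x, y, z) offset (in_channels += 3 instead of += 2, plus the new vz/z_offset terms), so the derived input width of the first PFN layer grows by one channel. Below is a minimal sketch of that channel bookkeeping, illustrative only and not part of the patch; the flag values are assumptions rather than any shipped config.

    # Mirrors the in_channels arithmetic in PillarFeatureNet.__init__ after this patch.
    in_channels = 4             # x, y, z, reflectance
    with_cluster_center = True  # assumed for illustration
    with_voxel_center = True    # assumed for illustration
    with_distance = True        # assumed for illustration

    if with_cluster_center:
        in_channels += 3        # offset to the mean point of the pillar
    if with_voxel_center:
        in_channels += 3        # offset to the pillar (voxel) center, now including z
    if with_distance:
        in_channels += 1        # Euclidean distance to the origin

    print(in_channels)          # 11 with this patch; the same flags gave 10 before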
diff --git a/mmdet3d/ops/__init__.py b/mmdet3d/ops/__init__.py index bf9988abe8..1530f963b3 100644 --- a/mmdet3d/ops/__init__.py +++ b/mmdet3d/ops/__init__.py @@ -4,6 +4,7 @@ sigmoid_focal_loss) from .ball_query import ball_query +from .dgcnn_modules import DGCNNFAModule, DGCNNFPModule, DGCNNGFModule from .furthest_point_sample import (Points_Sampler, furthest_point_sample, furthest_point_sample_with_dist) from .gather_points import gather_points @@ -17,8 +18,9 @@ PAConvSAModule, PAConvSAModuleMSG, PointFPModule, PointSAModule, PointSAModuleMSG, build_sa_module) -from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch, - points_in_boxes_cpu, points_in_boxes_gpu) +from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_all, + points_in_boxes_cpu, points_in_boxes_part) +from .roipoint_pool3d import RoIPointPool3d from .sparse_block import (SparseBasicBlock, SparseBottleneck, make_sparse_convmodule) from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization @@ -29,13 +31,14 @@ 'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss', 'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck', - 'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu', + 'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu', 'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample', 'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn', 'gather_points', 'grouping_operation', 'group_points', 'GroupAll', 'QueryAndGroup', 'PointSAModule', 'PointSAModuleMSG', 'PointFPModule', - 'points_in_boxes_batch', 'get_compiler_version', 'assign_score_withk', - 'get_compiling_cuda_version', 'Points_Sampler', 'build_sa_module', - 'PAConv', 'PAConvCUDA', 'PAConvSAModuleMSG', 'PAConvSAModule', - 'PAConvCUDASAModule', 'PAConvCUDASAModuleMSG' + 'DGCNNFPModule', 'DGCNNGFModule', 'DGCNNFAModule', 'points_in_boxes_all', + 'get_compiler_version', 'assign_score_withk', 'get_compiling_cuda_version', + 'Points_Sampler', 'build_sa_module', 'PAConv', 'PAConvCUDA', + 'PAConvSAModuleMSG', 'PAConvSAModule', 'PAConvCUDASAModule', + 'PAConvCUDASAModuleMSG', 'RoIPointPool3d' ] diff --git a/mmdet3d/ops/ball_query/__init__.py b/mmdet3d/ops/ball_query/__init__.py index 5e7937cad9..2e4704259b 100644 --- a/mmdet3d/ops/ball_query/__init__.py +++ b/mmdet3d/ops/ball_query/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .ball_query import ball_query __all__ = ['ball_query'] diff --git a/mmdet3d/ops/ball_query/ball_query.py b/mmdet3d/ops/ball_query/ball_query.py index bae81ad29c..efbe89b4cb 100644 --- a/mmdet3d/ops/ball_query/ball_query.py +++ b/mmdet3d/ops/ball_query/ball_query.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from torch.autograd import Function @@ -23,7 +24,7 @@ def forward(ctx, min_radius: float, max_radius: float, sample_num: int, center_xyz (Tensor): (B, npoint, 3) centers of the ball query. Returns: - Tensor: (B, npoint, nsample) tensor with the indicies of + Tensor: (B, npoint, nsample) tensor with the indices of the features that form the query balls. """ assert center_xyz.is_contiguous() diff --git a/mmdet3d/ops/dgcnn_modules/__init__.py b/mmdet3d/ops/dgcnn_modules/__init__.py new file mode 100644 index 0000000000..67beb0907f --- /dev/null +++ b/mmdet3d/ops/dgcnn_modules/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .dgcnn_fa_module import DGCNNFAModule +from .dgcnn_fp_module import DGCNNFPModule +from .dgcnn_gf_module import DGCNNGFModule + +__all__ = ['DGCNNFAModule', 'DGCNNFPModule', 'DGCNNGFModule'] diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py new file mode 100644 index 0000000000..b0975e691b --- /dev/null +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + + +class DGCNNFAModule(BaseModule): + """Point feature aggregation module used in DGCNN. + + Aggregate all the features of points. + + Args: + mlp_channels (list[int]): List of mlp channels. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. Defaults to None. + """ + + def __init__(self, + mlp_channels, + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.fp16_enabled = False + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, ), + stride=(1, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + @force_fp32() + def forward(self, points): + """forward. + + Args: + points (List[Tensor]): tensor of the features to be aggregated. + + Returns: + Tensor: (B, N, M) M = mlp[-1], tensor of the output points. + """ + + if len(points) > 1: + new_points = torch.cat(points[1:], dim=-1) + new_points = new_points.transpose(1, 2).contiguous() # (B, C, N) + new_points_copy = new_points + + new_points = self.mlps(new_points) + + new_fa_points = new_points.max(dim=-1, keepdim=True)[0] + new_fa_points = new_fa_points.repeat(1, 1, new_points.shape[-1]) + + new_points = torch.cat([new_fa_points, new_points_copy], dim=1) + new_points = new_points.transpose(1, 2).contiguous() + else: + new_points = points + + return new_points diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py new file mode 100644 index 0000000000..c871721bc1 --- /dev/null +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + + +class DGCNNFPModule(BaseModule): + """Point feature propagation module used in DGCNN. + + Propagate the features from one set to another. + + Args: + mlp_channels (list[int]): List of mlp channels. + norm_cfg (dict, optional): Type of activation method. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. Defaults to None. 
+ """ + + def __init__(self, + mlp_channels, + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.fp16_enabled = False + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, ), + stride=(1, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + @force_fp32() + def forward(self, points): + """forward. + + Args: + points (Tensor): (B, N, C) tensor of the input points. + + Returns: + Tensor: (B, N, M) M = mlp[-1], tensor of the new points. + """ + + if points is not None: + new_points = points.transpose(1, 2).contiguous() # (B, C, N) + new_points = self.mlps(new_points) + new_points = new_points.transpose(1, 2).contiguous() + else: + new_points = points + + return new_points diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py new file mode 100644 index 0000000000..e317ccd086 --- /dev/null +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from torch import nn as nn +from torch.nn import functional as F + +from ..group_points import GroupAll, QueryAndGroup, grouping_operation + + +class BaseDGCNNGFModule(nn.Module): + """Base module for point graph feature module used in DGCNN. + + Args: + radii (list[float]): List of radius in each knn or ball query. + sample_nums (list[int]): Number of samples in each knn or ball query. + mlp_channels (list[list[int]]): Specify of the dgcnn before + the global pooling for each graph feature module. + knn_modes (list[str], optional): Type of KNN method, valid mode + ['F-KNN', 'D-KNN'], Defaults to ['F-KNN']. + dilated_group (bool, optional): Whether to use dilated ball query. + Defaults to False. + use_xyz (bool, optional): Whether to use xyz as point features. + Defaults to True. + pool_mode (str, optional): Type of pooling method. Defaults to 'max'. + normalize_xyz (bool, optional): If ball query, whether to normalize + local XYZ with radius. Defaults to False. + grouper_return_grouped_xyz (bool, optional): Whether to return grouped + xyz in `QueryAndGroup`. Defaults to False. + grouper_return_grouped_idx (bool, optional): Whether to return grouped + idx in `QueryAndGroup`. Defaults to False. + """ + + def __init__(self, + radii, + sample_nums, + mlp_channels, + knn_modes=['F-KNN'], + dilated_group=False, + use_xyz=True, + pool_mode='max', + normalize_xyz=False, + grouper_return_grouped_xyz=False, + grouper_return_grouped_idx=False): + super(BaseDGCNNGFModule, self).__init__() + + assert len(sample_nums) == len( + mlp_channels + ), 'Num_samples and mlp_channels should have the same length.' + assert pool_mode in ['max', 'avg' + ], "Pool_mode should be one of ['max', 'avg']." + assert isinstance(knn_modes, list) or isinstance( + knn_modes, tuple), 'The type of knn_modes should be list or tuple.' 
+ + if isinstance(mlp_channels, tuple): + mlp_channels = list(map(list, mlp_channels)) + self.mlp_channels = mlp_channels + + self.pool_mode = pool_mode + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + self.knn_modes = knn_modes + + for i in range(len(sample_nums)): + sample_num = sample_nums[i] + if sample_num is not None: + if self.knn_modes[i] == 'D-KNN': + grouper = QueryAndGroup( + radii[i], + sample_num, + use_xyz=use_xyz, + normalize_xyz=normalize_xyz, + return_grouped_xyz=grouper_return_grouped_xyz, + return_grouped_idx=True) + else: + grouper = QueryAndGroup( + radii[i], + sample_num, + use_xyz=use_xyz, + normalize_xyz=normalize_xyz, + return_grouped_xyz=grouper_return_grouped_xyz, + return_grouped_idx=grouper_return_grouped_idx) + else: + grouper = GroupAll(use_xyz) + self.groupers.append(grouper) + + def _pool_features(self, features): + """Perform feature aggregation using pooling operation. + + Args: + features (torch.Tensor): (B, C, N, K) + Features of locally grouped points before pooling. + + Returns: + torch.Tensor: (B, C, N) + Pooled features aggregating local information. + """ + if self.pool_mode == 'max': + # (B, C, N, 1) + new_features = F.max_pool2d( + features, kernel_size=[1, features.size(3)]) + elif self.pool_mode == 'avg': + # (B, C, N, 1) + new_features = F.avg_pool2d( + features, kernel_size=[1, features.size(3)]) + else: + raise NotImplementedError + + return new_features.squeeze(-1).contiguous() + + def forward(self, points): + """forward. + + Args: + points (Tensor): (B, N, C) input points. + + Returns: + List[Tensor]: (B, N, C1) new points generated from each graph + feature module. + """ + new_points_list = [points] + + for i in range(len(self.groupers)): + + new_points = new_points_list[i] + new_points_trans = new_points.transpose( + 1, 2).contiguous() # (B, C, N) + + if self.knn_modes[i] == 'D-KNN': + # (B, N, C) -> (B, N, K) + idx = self.groupers[i](new_points[..., -3:].contiguous(), + new_points[..., -3:].contiguous())[-1] + + grouped_results = grouping_operation( + new_points_trans, idx) # (B, C, N) -> (B, C, N, K) + grouped_results -= new_points_trans.unsqueeze(-1) + else: + grouped_results = self.groupers[i]( + new_points, new_points) # (B, N, C) -> (B, C, N, K) + + new_points = new_points_trans.unsqueeze(-1).repeat( + 1, 1, 1, grouped_results.shape[-1]) + new_points = torch.cat([grouped_results, new_points], dim=1) + + # (B, mlp[-1], N, K) + new_points = self.mlps[i](new_points) + + # (B, mlp[-1], N) + new_points = self._pool_features(new_points) + new_points = new_points.transpose(1, 2).contiguous() + new_points_list.append(new_points) + + return new_points + + +class DGCNNGFModule(BaseDGCNNGFModule): + """Point graph feature module used in DGCNN. + + Args: + mlp_channels (list[int]): Specify of the dgcnn before + the global pooling for each graph feature module. + num_sample (int, optional): Number of samples in each knn or ball + query. Defaults to None. + knn_mode (str, optional): Type of KNN method, valid mode + ['F-KNN', 'D-KNN']. Defaults to 'F-KNN'. + radius (float, optional): Radius to group with. + Defaults to None. + dilated_group (bool, optional): Whether to use dilated ball query. + Defaults to False. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN2d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + use_xyz (bool, optional): Whether to use xyz as point features. + Defaults to True. + pool_mode (str, optional): Type of pooling method. 
+ Defaults to 'max'. + normalize_xyz (bool, optional): If ball query, whether to normalize + local XYZ with radius. Defaults to False. + bias (bool | str, optional): If specified as `auto`, it will be decided + by the norm_cfg. Bias will be set as True if `norm_cfg` is None, + otherwise False. Defaults to 'auto'. + """ + + def __init__(self, + mlp_channels, + num_sample=None, + knn_mode='F-KNN', + radius=None, + dilated_group=False, + norm_cfg=dict(type='BN2d'), + act_cfg=dict(type='ReLU'), + use_xyz=True, + pool_mode='max', + normalize_xyz=False, + bias='auto'): + super(DGCNNGFModule, self).__init__( + mlp_channels=[mlp_channels], + sample_nums=[num_sample], + knn_modes=[knn_mode], + radii=[radius], + use_xyz=use_xyz, + pool_mode=pool_mode, + normalize_xyz=normalize_xyz, + dilated_group=dilated_group) + + for i in range(len(self.mlp_channels)): + mlp_channel = self.mlp_channels[i] + + mlp = nn.Sequential() + for i in range(len(mlp_channel) - 1): + mlp.add_module( + f'layer{i}', + ConvModule( + mlp_channel[i], + mlp_channel[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + self.mlps.append(mlp) diff --git a/mmdet3d/ops/furthest_point_sample/__init__.py b/mmdet3d/ops/furthest_point_sample/__init__.py index 06d57d974f..556af8a3ec 100644 --- a/mmdet3d/ops/furthest_point_sample/__init__.py +++ b/mmdet3d/ops/furthest_point_sample/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) from .points_sampler import Points_Sampler diff --git a/mmdet3d/ops/furthest_point_sample/furthest_point_sample.py b/mmdet3d/ops/furthest_point_sample/furthest_point_sample.py index af53317810..cdf293bfb5 100644 --- a/mmdet3d/ops/furthest_point_sample/furthest_point_sample.py +++ b/mmdet3d/ops/furthest_point_sample/furthest_point_sample.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from torch.autograd import Function diff --git a/mmdet3d/ops/furthest_point_sample/points_sampler.py b/mmdet3d/ops/furthest_point_sample/points_sampler.py index 9a3bd2ae42..d1b6961a64 100644 --- a/mmdet3d/ops/furthest_point_sample/points_sampler.py +++ b/mmdet3d/ops/furthest_point_sample/points_sampler.py @@ -1,7 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + import torch from mmcv.runner import force_fp32 from torch import nn as nn -from typing import List from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) @@ -36,13 +38,13 @@ class Points_Sampler(nn.Module): Args: num_point (list[int]): Number of sample points. - fps_mod_list (list[str]: Type of FPS method, valid mod + fps_mod_list (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int]): Range of points to apply FPS. - Default: [-1]. + fps_sample_range_list (list[int], optional): + Range of points to apply FPS. Default: [-1]. """ def __init__(self, diff --git a/mmdet3d/ops/furthest_point_sample/utils.py b/mmdet3d/ops/furthest_point_sample/utils.py index 4ca235e13b..d0668d0349 100644 --- a/mmdet3d/ops/furthest_point_sample/utils.py +++ b/mmdet3d/ops/furthest_point_sample/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import torch @@ -7,7 +8,7 @@ def calc_square_dist(point_feat_a, point_feat_b, norm=True): Args: point_feat_a (Tensor): (B, N, C) Feature vector of each point. point_feat_b (Tensor): (B, M, C) Feature vector of each point. - norm (Bool): Whether to normalize the distance. + norm (Bool, optional): Whether to normalize the distance. Default: True. Returns: diff --git a/mmdet3d/ops/gather_points/__init__.py b/mmdet3d/ops/gather_points/__init__.py index e8018ea879..51476dd72e 100644 --- a/mmdet3d/ops/gather_points/__init__.py +++ b/mmdet3d/ops/gather_points/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .gather_points import gather_points __all__ = ['gather_points'] diff --git a/mmdet3d/ops/gather_points/gather_points.py b/mmdet3d/ops/gather_points/gather_points.py index 8de4958bb7..1c290a6e5f 100644 --- a/mmdet3d/ops/gather_points/gather_points.py +++ b/mmdet3d/ops/gather_points/gather_points.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from torch.autograd import Function diff --git a/mmdet3d/ops/group_points/__init__.py b/mmdet3d/ops/group_points/__init__.py index 7d8040a460..a8bfefe831 100644 --- a/mmdet3d/ops/group_points/__init__.py +++ b/mmdet3d/ops/group_points/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .group_points import GroupAll, QueryAndGroup, grouping_operation __all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation'] diff --git a/mmdet3d/ops/group_points/group_points.py b/mmdet3d/ops/group_points/group_points.py index 34c59bd800..0430aa65e6 100644 --- a/mmdet3d/ops/group_points/group_points.py +++ b/mmdet3d/ops/group_points/group_points.py @@ -1,3 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + from typing import Tuple import torch @@ -16,22 +18,22 @@ class QueryAndGroup(nn.Module): Groups with a ball query of radius Args: - max_radius (float | None): The maximum radius of the balls. + max_radius (float): The maximum radius of the balls. If None is given, we will use kNN sampling instead of ball query. sample_num (int): Maximum number of features to gather in the ball. - min_radius (float): The minimum radius of the balls. - use_xyz (bool): Whether to use xyz. + min_radius (float, optional): The minimum radius of the balls. + Default: 0. + use_xyz (bool, optional): Whether to use xyz. Default: True. - return_grouped_xyz (bool): Whether to return grouped xyz. + return_grouped_xyz (bool, optional): Whether to return grouped xyz. Default: False. - normalize_xyz (bool): Whether to normalize xyz. + normalize_xyz (bool, optional): Whether to normalize xyz. Default: False. - uniform_sample (bool): Whether to sample uniformly. + uniform_sample (bool, optional): Whether to sample uniformly. Default: False - return_unique_cnt (bool): Whether to return the count of - unique samples. - Default: False. - return_grouped_idx (bool): Whether to return grouped idx. + return_unique_cnt (bool, optional): Whether to return the count of + unique samples. Default: False. + return_grouped_idx (bool, optional): Whether to return grouped idx. Default: False. """ diff --git a/mmdet3d/ops/interpolate/__init__.py b/mmdet3d/ops/interpolate/__init__.py index cf44d98390..7aded93c3d 100644 --- a/mmdet3d/ops/interpolate/__init__.py +++ b/mmdet3d/ops/interpolate/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
from .three_interpolate import three_interpolate from .three_nn import three_nn diff --git a/mmdet3d/ops/interpolate/three_interpolate.py b/mmdet3d/ops/interpolate/three_interpolate.py index 233cb5d75b..6709656c96 100644 --- a/mmdet3d/ops/interpolate/three_interpolate.py +++ b/mmdet3d/ops/interpolate/three_interpolate.py @@ -1,6 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch.autograd import Function -from typing import Tuple from . import interpolate_ext diff --git a/mmdet3d/ops/interpolate/three_nn.py b/mmdet3d/ops/interpolate/three_nn.py index 6c174bd7b9..a7b65ac00c 100644 --- a/mmdet3d/ops/interpolate/three_nn.py +++ b/mmdet3d/ops/interpolate/three_nn.py @@ -1,6 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch.autograd import Function -from typing import Tuple from . import interpolate_ext diff --git a/mmdet3d/ops/iou3d/__init__.py b/mmdet3d/ops/iou3d/__init__.py index 9c35fb7af3..dd584e75dc 100644 --- a/mmdet3d/ops/iou3d/__init__.py +++ b/mmdet3d/ops/iou3d/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu __all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu'] diff --git a/mmdet3d/ops/iou3d/iou3d_utils.py b/mmdet3d/ops/iou3d/iou3d_utils.py index 6f36019e72..6ad6001972 100644 --- a/mmdet3d/ops/iou3d/iou3d_utils.py +++ b/mmdet3d/ops/iou3d/iou3d_utils.py @@ -1,10 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from . import iou3d_cuda def boxes_iou_bev(boxes_a, boxes_b): - """Calculate boxes IoU in the bird view. + """Calculate boxes IoU in the Bird's Eye View. Args: boxes_a (torch.Tensor): Input boxes a with shape (M, 5). @@ -22,24 +23,29 @@ def boxes_iou_bev(boxes_a, boxes_b): return ans_iou -def nms_gpu(boxes, scores, thresh, pre_maxsize=None, post_max_size=None): - """Nms function with gpu implementation. +def nms_gpu(boxes, scores, thresh, pre_max_size=None, post_max_size=None): + """NMS function GPU implementation (for BEV boxes). The overlap of two + boxes for IoU calculation is defined as the exact overlapping area of the + two boxes. In this function, one can also set `pre_max_size` and + `post_max_size`. Args: boxes (torch.Tensor): Input boxes with the shape of [N, 5] ([x1, y1, x2, y2, ry]). scores (torch.Tensor): Scores of boxes with the shape of [N]. thresh (int): Threshold. - pre_maxsize (int): Max size of boxes before nms. Default: None. - post_maxsize (int): Max size of boxes after nms. Default: None. + pre_max_size (int, optional): Max size of boxes before NMS. + Default: None. + post_max_size (int, optional): Max size of boxes after NMS. + Default: None. Returns: - torch.Tensor: Indexes after nms. + torch.Tensor: Indexes after NMS. """ order = scores.sort(0, descending=True)[1] - if pre_maxsize is not None: - order = order[:pre_maxsize] + if pre_max_size is not None: + order = order[:pre_max_size] boxes = boxes[order].contiguous() keep = torch.zeros(boxes.size(0), dtype=torch.long) @@ -51,12 +57,14 @@ def nms_gpu(boxes, scores, thresh, pre_maxsize=None, post_max_size=None): def nms_normal_gpu(boxes, scores, thresh): - """Normal non maximum suppression on GPU. + """Normal NMS function GPU implementation (for BEV boxes). The overlap of + two boxes for IoU calculation is defined as the exact overlapping area of + the two boxes WITH their yaw angle set to 0. Args: boxes (torch.Tensor): Input boxes with shape (N, 5). 
scores (torch.Tensor): Scores of predicted boxes with shape (N). - thresh (torch.Tensor): Threshold of non maximum suppression. + thresh (torch.Tensor): Threshold of NMS. Returns: torch.Tensor: Remaining indices with scores in descending order. diff --git a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu index 861aea3c5a..a993384780 100644 --- a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu +++ b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu @@ -61,9 +61,9 @@ __device__ inline int check_in_box2d(const float *box, const Point &p) { angle_sin = sin(-box[4]); // rotate the point in the opposite direction of box float rot_x = - (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x; + (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; float rot_y = - -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; + (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; #ifdef DEBUG printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2], box[3], box[4]); @@ -112,9 +112,9 @@ __device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p) { float new_x = - (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x; + (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; float new_y = - -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } diff --git a/mmdet3d/ops/knn/__init__.py b/mmdet3d/ops/knn/__init__.py index c8cb712b09..d756a84d6f 100644 --- a/mmdet3d/ops/knn/__init__.py +++ b/mmdet3d/ops/knn/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .knn import knn __all__ = ['knn'] diff --git a/mmdet3d/ops/knn/knn.py b/mmdet3d/ops/knn/knn.py index f05bc75f88..03fbe1f811 100644 --- a/mmdet3d/ops/knn/knn.py +++ b/mmdet3d/ops/knn/knn.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from torch.autograd import Function @@ -27,11 +28,11 @@ def forward(ctx, center_xyz (Tensor): (B, npoint, 3) if transposed == False, else (B, 3, npoint). centers of the knn query. transposed (bool): whether the input tensors are transposed. - defaults to False. Should not expicitly use this keyword + defaults to False. Should not explicitly use this keyword when calling knn (=KNN.apply), just add the fourth param. Returns: - Tensor: (B, k, npoint) tensor with the indicies of + Tensor: (B, k, npoint) tensor with the indices of the features that form k-nearest neighbours. """ assert k > 0 diff --git a/mmdet3d/ops/norm.py b/mmdet3d/ops/norm.py index e9db8fb579..52a1363d2e 100644 --- a/mmdet3d/ops/norm.py +++ b/mmdet3d/ops/norm.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import NORM_LAYERS from mmcv.runner import force_fp32 @@ -26,7 +27,7 @@ def backward(ctx, grad_output): @NORM_LAYERS.register_module('naiveSyncBN1d') class NaiveSyncBatchNorm1d(nn.BatchNorm1d): - """Syncronized Batch Normalization for 3D Tensors. + """Synchronized Batch Normalization for 3D Tensors. Note: This implementation is modified from @@ -37,7 +38,7 @@ class NaiveSyncBatchNorm1d(nn.BatchNorm1d): when the batch size on each worker is quite different (e.g., when scale augmentation is used). In 3D detection, different workers has points of different shapes, - whish also cause instability. + which also cause instability. 
Use this implementation before `nn.SyncBatchNorm` is fixed. It is slower than `nn.SyncBatchNorm`. @@ -80,7 +81,7 @@ def forward(self, input): @NORM_LAYERS.register_module('naiveSyncBN2d') class NaiveSyncBatchNorm2d(nn.BatchNorm2d): - """Syncronized Batch Normalization for 4D Tensors. + """Synchronized Batch Normalization for 4D Tensors. Note: This implementation is modified from diff --git a/mmdet3d/ops/paconv/__init__.py b/mmdet3d/ops/paconv/__init__.py index ab19d7c20f..f723c14378 100644 --- a/mmdet3d/ops/paconv/__init__.py +++ b/mmdet3d/ops/paconv/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from .assign_score import assign_score_withk from .paconv import PAConv, PAConvCUDA diff --git a/mmdet3d/ops/paconv/assign_score.py b/mmdet3d/ops/paconv/assign_score.py index 3e3f6730da..9d00b7b6cf 100644 --- a/mmdet3d/ops/paconv/assign_score.py +++ b/mmdet3d/ops/paconv/assign_score.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. from torch.autograd import Function from . import assign_score_withk_ext diff --git a/mmdet3d/ops/paconv/paconv.py b/mmdet3d/ops/paconv/paconv.py index 3e401a49b8..1f53deb8bb 100644 --- a/mmdet3d/ops/paconv/paconv.py +++ b/mmdet3d/ops/paconv/paconv.py @@ -1,4 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. import copy + import torch from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, constant_init) @@ -83,7 +85,7 @@ def forward(self, xyz_features): Args: xyz_features (torch.Tensor): (B, C, N, K), features constructed from xyz coordinates of point pairs. May contain relative - positions, Euclidian distance, etc. + positions, Euclidean distance, etc. Returns: torch.Tensor: (B, N, K, M), predicted scores for `M` kernels. @@ -174,7 +176,7 @@ def __init__(self, # (grouped_xyz - center_xyz, grouped_xyz) self.scorenet_in_channels = 6 elif scorenet_input == 'w_neighbor_dist': - # (center_xyz, grouped_xyz - center_xyz, Euclidian distance) + # (center_xyz, grouped_xyz - center_xyz, Euclidean distance) self.scorenet_in_channels = 7 else: raise NotImplementedError( diff --git a/mmdet3d/ops/paconv/utils.py b/mmdet3d/ops/paconv/utils.py index 4441c86e20..68e71d51d6 100644 --- a/mmdet3d/ops/paconv/utils.py +++ b/mmdet3d/ops/paconv/utils.py @@ -1,15 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch def calc_euclidian_dist(xyz1, xyz2): - """Calculate the Euclidian distance between two sets of points. + """Calculate the Euclidean distance between two sets of points. Args: xyz1 (torch.Tensor): (N, 3), the first set of points. xyz2 (torch.Tensor): (N, 3), the second set of points. Returns: - torch.Tensor: (N, ), the Euclidian distance between each point pair. + torch.Tensor: (N, ), the Euclidean distance between each point pair. """ assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same' assert xyz1.shape[1] == xyz2.shape[1] == 3, \ diff --git a/mmdet3d/ops/pointnet_modules/paconv_sa_module.py b/mmdet3d/ops/pointnet_modules/paconv_sa_module.py index e2deb7ef65..361ecbb216 100644 --- a/mmdet3d/ops/pointnet_modules/paconv_sa_module.py +++ b/mmdet3d/ops/pointnet_modules/paconv_sa_module.py @@ -28,7 +28,7 @@ class PAConvSAModuleMSG(BasePointSAModule): - 'w_neighbor': Use xyz coordinates and the difference with center points as input. - 'w_neighbor_dist': Use xyz coordinates, the difference with - center points and the Euclidian distance as input. + center points and the Euclidean distance as input. 
scorenet_cfg (dict, optional): Config of the ScoreNet module, which may contain the following keys and values: @@ -239,11 +239,12 @@ def forward( Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor): (B, C, N) features of each point. + features (Tensor, optional): (B, C, N) features of each point. Default: None. - indices (Tensor): (B, num_point) Index of the features. + indices (Tensor, optional): (B, num_point) Index of the features. + Default: None. + target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. Default: None. - target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. Returns: Tensor: (B, M, 3) where M is the number of points. diff --git a/mmdet3d/ops/pointnet_modules/point_fp_module.py b/mmdet3d/ops/pointnet_modules/point_fp_module.py index 6f4ba8eac0..4915bb08a4 100644 --- a/mmdet3d/ops/pointnet_modules/point_fp_module.py +++ b/mmdet3d/ops/pointnet_modules/point_fp_module.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List + import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn -from typing import List from mmdet3d.ops import three_interpolate, three_nn @@ -15,7 +16,7 @@ class PointFPModule(BaseModule): Args: mlp_channels (list[int]): List of mlp channels. - norm_cfg (dict): Type of normalization method. + norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). """ diff --git a/mmdet3d/ops/pointnet_modules/point_sa_module.py b/mmdet3d/ops/pointnet_modules/point_sa_module.py index 687af4daa2..193b3690a9 100644 --- a/mmdet3d/ops/pointnet_modules/point_sa_module.py +++ b/mmdet3d/ops/pointnet_modules/point_sa_module.py @@ -18,25 +18,25 @@ class BasePointSAModule(nn.Module): sample_nums (list[int]): Number of samples in each ball query. mlp_channels (list[list[int]]): Specify of the pointnet before the global pooling for each scale. - fps_mod (list[str]: Type of FPS method, valid mod + fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int]): Range of points to apply FPS. - Default: [-1]. - dilated_group (bool): Whether to use dilated ball query. + fps_sample_range_list (list[int], optional): + Range of points to apply FPS. Default: [-1]. + dilated_group (bool, optional): Whether to use dilated ball query. Default: False. - use_xyz (bool): Whether to use xyz. + use_xyz (bool, optional): Whether to use xyz. Default: True. - pool_mod (str): Type of pooling method. + pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. - normalize_xyz (bool): Whether to normalize local XYZ with radius. - Default: False. - grouper_return_grouped_xyz (bool): Whether to return grouped xyz in - `QueryAndGroup`. Defaults to False. - grouper_return_grouped_idx (bool): Whether to return grouped idx in - `QueryAndGroup`. Defaults to False. + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. + grouper_return_grouped_xyz (bool, optional): Whether to return + grouped xyz in `QueryAndGroup`. Defaults to False. + grouper_return_grouped_idx (bool, optional): Whether to return + grouped idx in `QueryAndGroup`. Defaults to False. 
""" def __init__(self, @@ -69,6 +69,8 @@ def __init__(self, self.num_point = [num_point] elif isinstance(num_point, list) or isinstance(num_point, tuple): self.num_point = num_point + elif num_point is None: + self.num_point = None else: raise NotImplementedError('Error type of num_point!') @@ -78,8 +80,12 @@ def __init__(self, self.fps_mod_list = fps_mod self.fps_sample_range_list = fps_sample_range_list - self.points_sampler = Points_Sampler(self.num_point, self.fps_mod_list, - self.fps_sample_range_list) + if self.num_point is not None: + self.points_sampler = Points_Sampler(self.num_point, + self.fps_mod_list, + self.fps_sample_range_list) + else: + self.points_sampler = None for i in range(len(radii)): radius = radii[i] @@ -111,9 +117,7 @@ def _sample_points(self, points_xyz, features, indices, target_xyz): Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor): (B, C, N) features of each point. - Default: None. indices (Tensor): (B, num_point) Index of the features. - Default: None. target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. Returns: @@ -128,9 +132,12 @@ def _sample_points(self, points_xyz, features, indices, target_xyz): elif target_xyz is not None: new_xyz = target_xyz.contiguous() else: - indices = self.points_sampler(points_xyz, features) - new_xyz = gather_points(xyz_flipped, indices).transpose( - 1, 2).contiguous() if self.num_point is not None else None + if self.num_point is not None: + indices = self.points_sampler(points_xyz, features) + new_xyz = gather_points(xyz_flipped, + indices).transpose(1, 2).contiguous() + else: + new_xyz = None return new_xyz, indices @@ -169,11 +176,12 @@ def forward( Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor): (B, C, N) features of each point. + features (Tensor, optional): (B, C, N) features of each point. Default: None. - indices (Tensor): (B, num_point) Index of the features. + indices (Tensor, optional): (B, num_point) Index of the features. + Default: None. + target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. Default: None. - target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. Returns: Tensor: (B, M, 3) where M is the number of points. @@ -223,26 +231,26 @@ class PointSAModuleMSG(BasePointSAModule): sample_nums (list[int]): Number of samples in each ball query. mlp_channels (list[list[int]]): Specify of the pointnet before the global pooling for each scale. - fps_mod (list[str]: Type of FPS method, valid mod + fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int]): Range of points to apply FPS. - Default: [-1]. - dilated_group (bool): Whether to use dilated ball query. + fps_sample_range_list (list[int], optional): Range of points to + apply FPS. Default: [-1]. + dilated_group (bool, optional): Whether to use dilated ball query. Default: False. - norm_cfg (dict): Type of normalization method. + norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). - use_xyz (bool): Whether to use xyz. + use_xyz (bool, optional): Whether to use xyz. Default: True. - pool_mod (str): Type of pooling method. + pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. - normalize_xyz (bool): Whether to normalize local XYZ with radius. - Default: False. 
- bias (bool | str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise - False. Default: "auto". + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. + bias (bool | str, optional): If specified as `auto`, it will be + decided by `norm_cfg`. `bias` will be set as True if + `norm_cfg` is None, otherwise False. Default: 'auto'. """ def __init__(self, @@ -298,24 +306,24 @@ class PointSAModule(PointSAModuleMSG): Args: mlp_channels (list[int]): Specify of the pointnet before the global pooling for each scale. - num_point (int): Number of points. + num_point (int, optional): Number of points. Default: None. - radius (float): Radius to group with. + radius (float, optional): Radius to group with. Default: None. - num_sample (int): Number of samples in each ball query. + num_sample (int, optional): Number of samples in each ball query. Default: None. - norm_cfg (dict): Type of normalization method. + norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). - use_xyz (bool): Whether to use xyz. + use_xyz (bool, optional): Whether to use xyz. Default: True. - pool_mod (str): Type of pooling method. + pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. - fps_mod (list[str]: Type of FPS method, valid mod + fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. - fps_sample_range_list (list[int]): Range of points to apply FPS. - Default: [-1]. - normalize_xyz (bool): Whether to normalize local XYZ with radius. - Default: False. + fps_sample_range_list (list[int], optional): Range of points + to apply FPS. Default: [-1]. + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. """ def __init__(self, diff --git a/mmdet3d/ops/roiaware_pool3d/__init__.py b/mmdet3d/ops/roiaware_pool3d/__init__.py index aba9e18d37..627e4a743e 100644 --- a/mmdet3d/ops/roiaware_pool3d/__init__.py +++ b/mmdet3d/ops/roiaware_pool3d/__init__.py @@ -1,8 +1,9 @@ -from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu, - points_in_boxes_gpu) +# Copyright (c) OpenMMLab. All rights reserved. +from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + points_in_boxes_part) from .roiaware_pool3d import RoIAwarePool3d __all__ = [ - 'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu', - 'points_in_boxes_batch' + 'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu', + 'points_in_boxes_all' ] diff --git a/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py b/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py index f576fedcc5..1a8c2474a4 100644 --- a/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py +++ b/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py @@ -1,23 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch from . import roiaware_pool3d_ext -def points_in_boxes_gpu(points, boxes): - """Find points that are in boxes (CUDA) +def points_in_boxes_part(points, boxes): + """Find the box in which each point is (CUDA). 
Args: - points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [B, T, 7], - num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate, - (x, y, z) is the bottom center + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in + LiDAR/DEPTH coordinate, (x, y, z) is the bottom center Returns: box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 """ - assert boxes.shape[0] == points.shape[0], \ + assert points.shape[0] == boxes.shape[0], \ f'Points and boxes should have the same batch size, ' \ - f'got {boxes.shape[0]} and {boxes.shape[0]}' + f'got {points.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ f'boxes dimension should be 7, ' \ f'got unexpected shape {boxes.shape[2]}' @@ -43,56 +44,61 @@ def points_in_boxes_gpu(points, boxes): if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) - roiaware_pool3d_ext.points_in_boxes_gpu(boxes.contiguous(), - points.contiguous(), - box_idxs_of_pts) + roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) return box_idxs_of_pts def points_in_boxes_cpu(points, boxes): - """Find points that are in boxes (CPU) - - Note: - Currently, the output of this function is different from that of - points_in_boxes_gpu. + """Find all boxes in which each point is (CPU). The CPU version of + :meth:`points_in_boxes_all`. Args: - points (torch.Tensor): [npoints, 3] - boxes (torch.Tensor): [N, 7], in LiDAR coordinate, - (x, y, z) is the bottom center + points (torch.Tensor): [B, M, 3], [x, y, z] in + LiDAR/DEPTH coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], + (x, y, z) is the bottom center. Returns: - point_indices (torch.Tensor): (N, npoints) + box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. """ - # TODO: Refactor this function as a CPU version of points_in_boxes_gpu - assert boxes.shape[1] == 7, \ + assert points.shape[0] == boxes.shape[0], \ + f'Points and boxes should have the same batch size, ' \ + f'got {points.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ f'boxes dimension should be 7, ' \ f'got unexpected shape {boxes.shape[2]}' - assert points.shape[1] == 3, \ + assert points.shape[2] == 3, \ f'points dimension should be 3, ' \ f'got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + num_boxes = boxes.shape[1] - point_indices = points.new_zeros((boxes.shape[0], points.shape[0]), + point_indices = points.new_zeros((batch_size, num_boxes, num_points), dtype=torch.int) - roiaware_pool3d_ext.points_in_boxes_cpu(boxes.float().contiguous(), - points.float().contiguous(), - point_indices) + for b in range(batch_size): + roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(), + points[b].float().contiguous(), + point_indices[b]) + point_indices = point_indices.transpose(1, 2) return point_indices -def points_in_boxes_batch(points, boxes): - """Find points that are in boxes (CUDA) +def points_in_boxes_all(points, boxes): + """Find all boxes in which each point is (CUDA). Args: - points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [B, T, 7], - num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate, + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], (x, y, z) is the bottom center. 
Returns: - box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0 + box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. """ assert boxes.shape[0] == points.shape[0], \ f'Points and boxes should have the same batch size, ' \ @@ -116,8 +122,8 @@ def points_in_boxes_batch(points, boxes): if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) - roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(), - points.contiguous(), - box_idxs_of_pts) + roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) return box_idxs_of_pts diff --git a/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py b/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py index 231d718068..32c00fcac7 100644 --- a/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py +++ b/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import mmcv import torch from torch import nn as nn diff --git a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp index a26ffb62bb..f8c5494d2e 100644 --- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp +++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp @@ -15,9 +15,7 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { - // should rotate pi/2 + alpha to translate LiDAR to local - float rot_angle = rz + M_PI / 2; - float cosa = cos(rot_angle), sina = sin(rot_angle); + float cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } @@ -25,23 +23,23 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) - // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; - float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; - cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center - if (fabsf(z - cz) > h / 2.0) return 0; + if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y); - float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & - (local_y > -w / 2.0) & (local_y < w / 2.0); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor pts_indices_tensor) { - // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the + // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is the // bottom center, each box DO NOT overlaps params pts: (npoints, 3) [x, y, z] // in LiDAR coordinate params pts_indices: (N, npoints) diff --git a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu index 896b316e69..4b90897e3a 100644 --- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu +++ 
b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu @@ -24,9 +24,7 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { - // should rotate pi/2 + alpha to translate LiDAR to local - float rot_angle = rz + M_PI / 2; - float cosa = cos(rot_angle), sina = sin(rot_angle); + float cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } @@ -34,25 +32,25 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) - // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; - float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; - cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center - if (fabsf(z - cz) > h / 2.0) return 0; + if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); - float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & - (local_y > -w / 2.0) & (local_y < w / 2.0); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } -__global__ void points_in_boxes_kernel(int batch_size, int boxes_num, - int pts_num, const float *boxes, - const float *pts, - int *box_idx_of_points) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num, + int pts_num, const float *boxes, + const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 @@ -76,11 +74,11 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num, } } -__global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num, - int pts_num, const float *boxes, - const float *pts, - int *box_idx_of_points) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num, + int pts_num, const float *boxes, + const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 @@ -104,10 +102,10 @@ __global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num, } } -void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, - const float *boxes, const float *pts, - int *box_idx_of_points) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num, + const float *boxes, const float *pts, + int *box_idx_of_points) 
{ + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 @@ -115,8 +113,8 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); dim3 threads(THREADS_PER_BLOCK); - points_in_boxes_kernel<<>>(batch_size, boxes_num, pts_num, - boxes, pts, box_idx_of_points); + points_in_boxes_part_kernel<<>>(batch_size, boxes_num, pts_num, + boxes, pts, box_idx_of_points); err = cudaGetLastError(); if (cudaSuccess != err) { @@ -129,17 +127,17 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, #endif } -void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num, - const float *boxes, const float *pts, - int *box_idx_of_points) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num, + const float *boxes, const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in // LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 cudaError_t err; dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); dim3 threads(THREADS_PER_BLOCK); - points_in_boxes_batch_kernel<<>>( + points_in_boxes_all_kernel<<>>( batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); err = cudaGetLastError(); @@ -153,9 +151,9 @@ void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num, #endif } -int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, - at::Tensor box_idx_of_points_tensor) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 @@ -172,15 +170,15 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, const float *pts = pts_tensor.data_ptr(); int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); - points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts, - box_idx_of_points); + points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts, + box_idx_of_points); return 1; } -int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, - at::Tensor box_idx_of_points_tensor) { - // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is +int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is // the bottom center. 
params pts: (B, npoints, 3) [x, y, z] in LiDAR // coordinate params boxes_idx_of_points: (B, npoints), default -1 @@ -196,8 +194,8 @@ int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, const float *pts = pts_tensor.data_ptr(); int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); - points_in_boxes_batch_launcher(batch_size, boxes_num, pts_num, boxes, pts, - box_idx_of_points); + points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts, + box_idx_of_points); return 1; } diff --git a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp index cd743b18bb..607d783eb5 100644 --- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp +++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp @@ -40,16 +40,16 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor pts_indices_tensor); -int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, - at::Tensor box_idx_of_points_tensor); +int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor); -int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, - at::Tensor box_idx_of_points_tensor); +int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor); int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax, at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method) { - // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coordinate // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate // params pts_feature: (npoints, C) // params argmax: (N, out_x, out_y, out_z, C) @@ -127,10 +127,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)"); m.def("backward", &roiaware_pool3d_gpu_backward, "roiaware pool3d backward (CUDA)"); - m.def("points_in_boxes_gpu", &points_in_boxes_gpu, - "points_in_boxes_gpu forward (CUDA)"); - m.def("points_in_boxes_batch", &points_in_boxes_batch, - "points_in_boxes_batch forward (CUDA)"); + m.def("points_in_boxes_part", &points_in_boxes_part, + "points_in_boxes_part forward (CUDA)"); + m.def("points_in_boxes_all", &points_in_boxes_all, + "points_in_boxes_all forward (CUDA)"); m.def("points_in_boxes_cpu", &points_in_boxes_cpu, "points_in_boxes_cpu forward (CPU)"); } diff --git a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu index 312b35dcbf..8f62e891de 100644 --- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu @@ -17,9 +17,7 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { - // should rotate pi/2 + alpha to translate LiDAR to local - float rot_angle = rz + M_PI / 2; - float cosa = cos(rot_angle), sina = sin(rot_angle); + float cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } @@ -27,17 +25,17 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) - // param 
box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; - float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; - cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center - if (fabsf(z - cz) > h / 2.0) return 0; + if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); - float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & - (local_y > -w / 2.0) & (local_y < w / 2.0); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } @@ -45,9 +43,9 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z, const float *rois, const float *pts, int *pts_mask) { - // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate // params pts: (npoints, 3) [x, y, z] - // params pts_mask: (N, npoints): -1 means point doesnot in this box, + // params pts_mask: (N, npoints): -1 means point does not in this box, // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; @@ -63,14 +61,14 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, pts_mask[0] = -1; if (cur_in_flag > 0) { float local_z = pts[2] - rois[2]; - float w = rois[3], l = rois[4], h = rois[5]; + float x_size = rois[3], y_size = rois[4], z_size = rois[5]; - float x_res = l / out_x; - float y_res = w / out_y; - float z_res = h / out_z; + float x_res = x_size / out_x; + float y_res = y_size / out_y; + float z_res = z_size / out_z; - unsigned int x_idx = int((local_x + l / 2) / x_res); - unsigned int y_idx = int((local_y + w / 2) / y_res); + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int y_idx = int((local_y + y_size / 2) / y_res); unsigned int z_idx = int(local_z / z_res); x_idx = min(max(x_idx, 0), out_x - 1); @@ -231,7 +229,7 @@ void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method) { - // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate // params pts_feature: (npoints, C) // params argmax: (N, out_x, out_y, out_z, C) diff --git a/mmdet3d/ops/roipoint_pool3d/__init__.py b/mmdet3d/ops/roipoint_pool3d/__init__.py new file mode 100644 index 0000000000..263f2dcd41 --- /dev/null +++ b/mmdet3d/ops/roipoint_pool3d/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .roipoint_pool3d import RoIPointPool3d + +__all__ = ['RoIPointPool3d'] diff --git a/mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py b/mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py new file mode 100644 index 0000000000..5b35f46dc2 --- /dev/null +++ b/mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
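The roiaware kernels above (and the roipoint kernel added later in this patch) share the same membership test: shift a point into the box frame, rotate the offset by -rz, and compare against half extents, with the box z reference at the bottom face. A NumPy sketch of that logic, offered only as an illustration of the CUDA/C++ code and not as the compiled extension:

import numpy as np

def point_in_box(pt, box):
    # box: (cx, cy, cz, x_size, y_size, z_size, rz), cz at the bottom face
    x, y, z = pt
    cx, cy, cz, x_size, y_size, z_size, rz = box
    cz += z_size / 2.0                      # move reference to the box center
    if abs(z - cz) > z_size / 2.0:
        return False
    cosa, sina = np.cos(-rz), np.sin(-rz)   # rotate the offset by -rz
    local_x = (x - cx) * cosa - (y - cy) * sina
    local_y = (x - cx) * sina + (y - cy) * cosa
    return abs(local_x) < x_size / 2.0 and abs(local_y) < y_size / 2.0

print(point_in_box((1.0, 0.5, 0.5), (0, 0, 0, 4, 2, 2, 0)))  # True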
+from torch import nn as nn +from torch.autograd import Function + +from . import roipoint_pool3d_ext + + +class RoIPointPool3d(nn.Module): + + def __init__(self, num_sampled_points=512): + super().__init__() + """ + Args: + num_sampled_points (int): Number of samples in each roi + """ + self.num_sampled_points = num_sampled_points + + def forward(self, points, point_features, boxes3d): + """ + Args: + points (torch.Tensor): Input points whose shape is BxNx3 + point_features: (B, N, C) + boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading] + + Returns: + torch.Tensor: (B, M, 512, 3 + C) pooled_features + torch.Tensor: (B, M) pooled_empty_flag + """ + return RoIPointPool3dFunction.apply(points, point_features, boxes3d, + self.num_sampled_points) + + +class RoIPointPool3dFunction(Function): + + @staticmethod + def forward(ctx, points, point_features, boxes3d, num_sampled_points=512): + """ + Args: + points (torch.Tensor): Input points whose shape is (B, N, 3) + point_features (torch.Tensor): Input points features shape is \ + (B, N, C) + boxes3d (torch.Tensor): Input bounding boxes whose shape is \ + (B, M, 7) + num_sampled_points (int): the num of sampled points + + Returns: + torch.Tensor: (B, M, 512, 3 + C) pooled_features + torch.Tensor: (B, M) pooled_empty_flag + """ + assert points.shape.__len__() == 3 and points.shape[2] == 3 + batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[ + 1], point_features.shape[2] + pooled_boxes3d = boxes3d.view(batch_size, -1, 7) + pooled_features = point_features.new_zeros( + (batch_size, boxes_num, num_sampled_points, 3 + feature_len)) + pooled_empty_flag = point_features.new_zeros( + (batch_size, boxes_num)).int() + + roipoint_pool3d_ext.forward(points.contiguous(), + pooled_boxes3d.contiguous(), + point_features.contiguous(), + pooled_features, pooled_empty_flag) + + return pooled_features, pooled_empty_flag + + @staticmethod + def backward(ctx, grad_out): + raise NotImplementedError + + +if __name__ == '__main__': + pass diff --git a/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp new file mode 100644 index 0000000000..9369b98482 --- /dev/null +++ b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp @@ -0,0 +1,66 @@ +/* +Modified for +https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu +Point cloud feature pooling +Written by Shaoshuai Shi +All Rights Reserved 2018. 
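For the new RoIPointPool3d op defined above, a minimal usage sketch follows; it assumes a CUDA build with the roipoint_pool3d_ext extension compiled, and the random inputs are only there to show the expected shapes.

import torch
from mmdet3d.ops.roipoint_pool3d import RoIPointPool3d

pool = RoIPointPool3d(num_sampled_points=512)
points = torch.rand(2, 1024, 3).cuda()           # (B, N, 3) xyz
point_features = torch.rand(2, 1024, 16).cuda()  # (B, N, C)
boxes3d = torch.rand(2, 8, 7).cuda()             # (B, M, 7) [x, y, z, dx, dy, dz, heading]

pooled_features, pooled_empty_flag = pool(points, point_features, boxes3d)
# pooled_features: (2, 8, 512, 3 + 16); pooled_empty_flag: (2, 8),
# where 1 marks a box that contained no points.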
+*/ +#include +#include + +#define CHECK_CUDA(x) do { \ + if (!x.type().is_cuda()) { \ + fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_CONTIGUOUS(x) do { \ + if (!x.is_contiguous()) { \ + fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + + +void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, + const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag); + + +int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){ + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) + CHECK_INPUT(xyz); + CHECK_INPUT(boxes3d); + CHECK_INPUT(pts_feature); + CHECK_INPUT(pooled_features); + CHECK_INPUT(pooled_empty_flag); + + int batch_size = xyz.size(0); + int pts_num = xyz.size(1); + int boxes_num = boxes3d.size(1); + int feature_in_len = pts_feature.size(2); + int sampled_pts_num = pooled_features.size(2); + + + const float * xyz_data = xyz.data(); + const float * boxes3d_data = boxes3d.data(); + const float * pts_feature_data = pts_feature.data(); + float * pooled_features_data = pooled_features.data(); + int * pooled_empty_flag_data = pooled_empty_flag.data(); + + roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, + xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data); + + + + return 1; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)"); +} diff --git a/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu new file mode 100644 index 0000000000..a63a4c7ec4 --- /dev/null +++ b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu @@ -0,0 +1,168 @@ +/* +Modified from +https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu +Point cloud feature pooling +Written by Shaoshuai Shi +All Rights Reserved 2018. 
+*/ + +#include +#include + +#define THREADS_PER_BLOCK 256 +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) +// #define DEBUG + +__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, + float rz, float &local_x, + float &local_y) { + float cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, + float &local_x, float &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the + // bottom center + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; + cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > dz / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) & + (local_y > -dy / 2.0) & (local_y < dy / 2.0); + return in_flag; +} + +__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){ + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means background points + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){ + return; + } + int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; + + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + + + float local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; + // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]); +} + + +__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num, + const int *pts_assign, int *pts_idx, int *pooled_empty_flag){ + // params xyz: (B, N, 3) + // params pts_feature: (B, N, C) + // params pts_assign: (B, N) + // params pts_idx: (B, M, 512) + // params pooled_empty_flag: (B, M) + + int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (boxes_idx >= boxes_num){ + return; + } + + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++){ + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){ + if (cnt < sampled_pts_num){ + pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } + else break; + } + } + + if (cnt == 0){ + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } + else if (cnt < sampled_pts_num){ + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++){ + int duplicate_idx = k % cnt; + int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } + } +} + + +__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, + const float *xyz, const int *pts_idx, const float *pts_feature, + float *pooled_features, int *pooled_empty_flag){ + // params xyz: (B, N, 3) + // params pts_idx: (B, M, 512) + // params 
pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) + + int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){ + return; + } + + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){ + return; + } + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + for (int j = 0; j < feature_in_len; j++) + pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j]; +} + + +void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, + const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){ + + // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num); + int *pts_assign = NULL; + cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int)); // (batch_size, N, M) + // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int)); + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + assign_pts_to_box3d<<>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign); + + int *pts_idx = NULL; + cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int)); // (batch_size, M, sampled_pts_num) + + dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); // blockIdx.x(col), blockIdx.y(row) + get_pooled_idx<<>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag); + + dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); + roipool3d_forward<<>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, + xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag); + + cudaFree(pts_assign); + cudaFree(pts_idx); + +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} diff --git a/mmdet3d/ops/sparse_block.py b/mmdet3d/ops/sparse_block.py index 52e343a18e..0539736342 100644 --- a/mmdet3d/ops/sparse_block.py +++ b/mmdet3d/ops/sparse_block.py @@ -14,12 +14,12 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule): Args: inplanes (int): inplanes of block. planes (int): planes of block. - stride (int): stride of the first block. Default: 1 - downsample (None | Module): down sample module for block. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') + stride (int, optional): stride of the first block. Default: 1. + downsample (Module, optional): down sample module for block. + conv_cfg (dict, optional): dictionary to construct and config conv + layer. Default: None. + norm_cfg (dict, optional): dictionary to construct and config norm + layer. Default: dict(type='BN'). """ expansion = 4 @@ -73,12 +73,12 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule): Args: inplanes (int): inplanes of block. planes (int): planes of block. 
- stride (int): stride of the first block. Default: 1 - downsample (None | Module): down sample module for block. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') + stride (int, optional): stride of the first block. Default: 1. + downsample (Module, optional): down sample module for block. + conv_cfg (dict, optional): dictionary to construct and config conv + layer. Default: None. + norm_cfg (dict, optional): dictionary to construct and config norm + layer. Default: dict(type='BN'). """ expansion = 1 diff --git a/mmdet3d/ops/spconv/conv.py b/mmdet3d/ops/spconv/conv.py index 6a38f0e686..f6e19a1a5e 100644 --- a/mmdet3d/ops/spconv/conv.py +++ b/mmdet3d/ops/spconv/conv.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import math + import numpy as np import torch from mmcv.cnn import CONV_LAYERS @@ -143,16 +144,16 @@ def forward(self, input): out_tensor.indice_dict = input.indice_dict out_tensor.grid = input.grid return out_tensor - datas = input.find_indice_pair(self.indice_key) + data = input.find_indice_pair(self.indice_key) if self.inverse: - assert datas is not None and self.indice_key is not None - _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas + assert data is not None and self.indice_key is not None + _, outids, indice_pairs, indice_pair_num, out_spatial_shape = data assert indice_pairs.shape[0] == np.prod( self.kernel_size ), 'inverse conv must have same kernel size as its couple conv' else: - if self.indice_key is not None and datas is not None: - outids, _, indice_pairs, indice_pair_num, _ = datas + if self.indice_key is not None and data is not None: + outids, _, indice_pairs, indice_pair_num, _ = data else: outids, indice_pairs, indice_pair_num = ops.get_indice_pairs( indices, diff --git a/mmdet3d/ops/spconv/include/prettyprint.h b/mmdet3d/ops/spconv/include/prettyprint.h index 871c0fdd06..0a6bdc3361 100644 --- a/mmdet3d/ops/spconv/include/prettyprint.h +++ b/mmdet3d/ops/spconv/include/prettyprint.h @@ -93,7 +93,7 @@ struct delimiters { }; // Functor to print containers. You can use this directly if you want -// to specificy a non-default delimiters type. The printing logic can +// to specify a non-default delimiters type. The printing logic can // be customized by specializing the nested template. template 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmdet3d/version.py b/mmdet3d/version.py index 6067966a77..9b89d23fe2 100644 --- a/mmdet3d/version.py +++ b/mmdet3d/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. 
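The multiprocessing hunk above defaults OMP_NUM_THREADS and MKL_NUM_THREADS to 1 when several dataloader workers are configured, so CPU cores are not over-subscribed. A stripped-down sketch of the same idea, with a stand-in value for cfg.data.workers_per_gpu since the full helper is not visible in this chunk:

import os
import warnings

workers_per_gpu = 4  # stand-in for cfg.data.workers_per_gpu

for var in ('OMP_NUM_THREADS', 'MKL_NUM_THREADS'):
    if var not in os.environ and workers_per_gpu > 1:
        warnings.warn(f'Setting {var}=1 by default to avoid overloading '
                      'the system; tune it for your workload if needed.')
        os.environ[var] = '1'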
-__version__ = '0.18.1' +__version__ = '1.0.0rc0' short_version = __version__ diff --git a/model-index.yml b/model-index.yml index 6292a8b4ff..091111b337 100644 --- a/model-index.yml +++ b/model-index.yml @@ -2,7 +2,6 @@ Import: - configs/3dssd/metafile.yml - configs/centerpoint/metafile.yml - configs/dynamic_voxelization/metafile.yml - - configs/fp16/metafile.yml - configs/free_anchor/metafile.yml - configs/h3dnet/metafile.yml - configs/imvotenet/metafile.yml @@ -17,3 +16,4 @@ Import: - configs/votenet/metafile.yml - configs/fcos3d/metafile.yml - configs/imvoxelnet/metafile.yml + - configs/pgd/metafile.yml diff --git a/requirements/runtime.txt b/requirements/runtime.txt index f0e7ae42e4..643cb0cdc0 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,8 +1,7 @@ lyft_dataset_sdk networkx>=2.2,<2.3 -# we may unlock the verion of numba in the future -numba==0.48.0 -numpy<1.20.0 +numba==0.53.0 +numpy nuscenes-devkit plyfile scikit-image diff --git a/resources/coord_sys_all.png b/resources/coord_sys_all.png new file mode 100644 index 0000000000..59f650fb2d Binary files /dev/null and b/resources/coord_sys_all.png differ diff --git a/setup.cfg b/setup.cfg index 51953f753b..f61734328a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,8 +6,11 @@ SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true [isort] line_length = 79 multi_line_output = 0 -known_standard_library = setuptools +extra_standard_library = setuptools known_first_party = mmdet,mmseg,mmdet3d -known_third_party = cv2,imageio,indoor3d_util,load_scannet_data,lyft_dataset_sdk,m2r,matplotlib,mmcv,nuimages,numba,numpy,nuscenes,pandas,plyfile,pycocotools,pyquaternion,pytest,pytorch_sphinx_theme,recommonmark,scannet_utils,scipy,seaborn,shapely,skimage,sphinx,tensorflow,terminaltables,torch,trimesh,waymo_open_dataset +known_third_party = cv2,imageio,indoor3d_util,load_scannet_data,lyft_dataset_sdk,m2r,matplotlib,mmcv,nuimages,numba,numpy,nuscenes,pandas,plyfile,pycocotools,pyquaternion,pytest,pytorch_sphinx_theme,recommonmark,requests,scannet_utils,scipy,seaborn,shapely,skimage,sphinx,tensorflow,terminaltables,torch,trimesh,ts,waymo_open_dataset no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY + +[codespell] +ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD diff --git a/setup.py b/setup.py index 4856225c4e..4e896e2b82 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ -from setuptools import find_packages, setup - import os import platform import shutil import sys -import torch import warnings from os import path as osp +from setuptools import find_packages, setup + +import torch from torch.utils.cpp_extension import (BuildExtension, CppExtension, CUDAExtension) @@ -273,6 +273,11 @@ def add_mim_extention(): 'src/roiaware_pool3d_kernel.cu', 'src/points_in_boxes_cuda.cu', ]), + make_cuda_ext( + name='roipoint_pool3d_ext', + module='mmdet3d.ops.roipoint_pool3d', + sources=['src/roipoint_pool3d.cpp'], + sources_cuda=['src/roipoint_pool3d_kernel.cu']), make_cuda_ext( name='ball_query_ext', module='mmdet3d.ops.ball_query', diff --git a/tests/data/kitti/kitti_dbinfos_train.pkl b/tests/data/kitti/kitti_dbinfos_train.pkl index baa56c1a25..d9be817a13 100644 Binary files a/tests/data/kitti/kitti_dbinfos_train.pkl and b/tests/data/kitti/kitti_dbinfos_train.pkl differ diff --git a/tests/data/lyft/lyft_infos.pkl b/tests/data/lyft/lyft_infos.pkl index 5fa2a41d13..84295d1341 100644 Binary files a/tests/data/lyft/lyft_infos.pkl and b/tests/data/lyft/lyft_infos.pkl differ diff --git 
a/tests/data/lyft/lyft_infos_val.pkl b/tests/data/lyft/lyft_infos_val.pkl index 4ee7a7767d..30e61d7606 100644 Binary files a/tests/data/lyft/lyft_infos_val.pkl and b/tests/data/lyft/lyft_infos_val.pkl differ diff --git a/tests/data/lyft/sample_results.pkl b/tests/data/lyft/sample_results.pkl index c6509dfc4a..132baa4848 100644 Binary files a/tests/data/lyft/sample_results.pkl and b/tests/data/lyft/sample_results.pkl differ diff --git a/tests/data/sunrgbd/sunrgbd_infos.pkl b/tests/data/sunrgbd/sunrgbd_infos.pkl index 8f98f2fb19..c637abb9e4 100644 Binary files a/tests/data/sunrgbd/sunrgbd_infos.pkl and b/tests/data/sunrgbd/sunrgbd_infos.pkl differ diff --git a/tests/data/waymo/kitti_format/waymo_dbinfos_train.pkl b/tests/data/waymo/kitti_format/waymo_dbinfos_train.pkl index 1816e6a94f..a88b8703e4 100644 Binary files a/tests/data/waymo/kitti_format/waymo_dbinfos_train.pkl and b/tests/data/waymo/kitti_format/waymo_dbinfos_train.pkl differ diff --git a/tests/test_data/test_datasets/test_kitti_dataset.py b/tests/test_data/test_datasets/test_kitti_dataset.py index 6c3c9d7310..d204741de1 100644 --- a/tests/test_data/test_datasets/test_kitti_dataset.py +++ b/tests/test_data/test_datasets/test_kitti_dataset.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np +import math import os -import pytest import tempfile + +import numpy as np +import pytest import torch -from mmdet3d.core.bbox import LiDARInstance3DBoxes +from mmdet3d.core.bbox import LiDARInstance3DBoxes, limit_period from mmdet3d.datasets import KittiDataset @@ -113,6 +115,7 @@ def test_getitem(): type='ObjectSample', db_sampler=dict( data_root='tests/data/kitti/', + # in coordinate system refactor, this test file is modified info_path='tests/data/kitti/kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( @@ -151,8 +154,29 @@ def test_getitem(): gt_bboxes_3d = data['gt_bboxes_3d']._data gt_labels_3d = data['gt_labels_3d']._data expected_gt_bboxes_3d = torch.tensor( - [[9.5081, -5.2269, -1.1370, 0.4915, 1.2288, 1.9353, -2.7136]]) + [[9.5081, -5.2269, -1.1370, 1.2288, 0.4915, 1.9353, 1.9988]]) expected_gt_labels_3d = torch.tensor([0]) + rot_matrix = data['img_metas']._data['pcd_rotation'] + rot_angle = data['img_metas']._data['pcd_rotation_angle'] + horizontal_flip = data['img_metas']._data['pcd_horizontal_flip'] + vertical_flip = data['img_metas']._data['pcd_vertical_flip'] + expected_rot_matrix = torch.tensor([[0.8018, 0.5976, 0.0000], + [-0.5976, 0.8018, 0.0000], + [0.0000, 0.0000, 1.0000]]) + expected_rot_angle = 0.6404654291602163 + noise_angle = 0.20247319 + assert torch.allclose(expected_rot_matrix, rot_matrix, atol=1e-4) + assert math.isclose(expected_rot_angle, rot_angle, abs_tol=1e-4) + assert horizontal_flip is True + assert vertical_flip is False + + # after coord system refactor + expected_gt_bboxes_3d[:, :3] = \ + expected_gt_bboxes_3d[:, :3] @ rot_matrix @ rot_matrix + expected_gt_bboxes_3d[:, -1:] = -np.pi - expected_gt_bboxes_3d[:, -1:] \ + + 2 * rot_angle - 2 * noise_angle + expected_gt_bboxes_3d[:, -1:] = limit_period( + expected_gt_bboxes_3d[:, -1:], period=np.pi * 2) assert points.shape == (780, 4) assert torch.allclose( gt_bboxes_3d.tensor, expected_gt_bboxes_3d, atol=1e-4) @@ -232,9 +256,10 @@ def test_evaluate(): def test_show(): - import mmcv from os import path as osp + import mmcv + from mmdet3d.core.bbox import LiDARInstance3DBoxes tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name @@ -346,9 +371,10 @@ def test_format_results(): pipeline, modality, split = 
_generate_kitti_dataset_config() kitti_dataset = KittiDataset(data_root, ann_file, split, pts_prefix, pipeline, classes, modality) + # coord system refactor boxes_3d = LiDARInstance3DBoxes( torch.tensor( - [[8.7314, -1.8559, -1.5997, 0.4800, 1.2000, 1.8900, 0.0100]])) + [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900, -1.5808]])) labels_3d = torch.tensor([ 0, ]) @@ -359,21 +385,23 @@ def test_format_results(): expected_name = np.array(['Pedestrian']) expected_truncated = np.array([0.]) expected_occluded = np.array([0]) - expected_alpha = np.array([-3.3410306]) + # coord sys refactor + expected_alpha = np.array(-3.3410306 + np.pi) expected_bbox = np.array([[710.443, 144.00221, 820.29114, 307.58667]]) expected_dimensions = np.array([[1.2, 1.89, 0.48]]) expected_location = np.array([[1.8399826, 1.4700007, 8.410018]]) - expected_rotation_y = np.array([-3.1315928]) + expected_rotation_y = np.array([0.0100]) expected_score = np.array([0.5]) expected_sample_idx = np.array([0]) assert np.all(result_files[0]['name'] == expected_name) assert np.allclose(result_files[0]['truncated'], expected_truncated) assert np.all(result_files[0]['occluded'] == expected_occluded) - assert np.allclose(result_files[0]['alpha'], expected_alpha) + assert np.allclose(result_files[0]['alpha'], expected_alpha, 1e-3) assert np.allclose(result_files[0]['bbox'], expected_bbox) assert np.allclose(result_files[0]['dimensions'], expected_dimensions) assert np.allclose(result_files[0]['location'], expected_location) - assert np.allclose(result_files[0]['rotation_y'], expected_rotation_y) + assert np.allclose(result_files[0]['rotation_y'], expected_rotation_y, + 1e-3) assert np.allclose(result_files[0]['score'], expected_score) assert np.allclose(result_files[0]['sample_idx'], expected_sample_idx) tmp_dir.cleanup() @@ -386,7 +414,7 @@ def test_bbox2result_kitti(): pipeline, classes, modality) boxes_3d = LiDARInstance3DBoxes( torch.tensor( - [[8.7314, -1.8559, -1.5997, 0.4800, 1.2000, 1.8900, 0.0100]])) + [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900, -1.5808]])) labels_3d = torch.tensor([ 0, ]) @@ -400,10 +428,11 @@ def test_bbox2result_kitti(): expected_file_path = os.path.join(temp_kitti_result_dir, '000000.txt') expected_name = np.array(['Pedestrian']) expected_dimensions = np.array([1.2000, 1.8900, 0.4800]) - expected_rotation_y = np.array([0.0100]) - np.pi + # coord system refactor (reverse sign) + expected_rotation_y = 0.0100 expected_score = np.array([0.5]) assert np.all(det_annos[0]['name'] == expected_name) - assert np.allclose(det_annos[0]['rotation_y'], expected_rotation_y) + assert np.allclose(det_annos[0]['rotation_y'], expected_rotation_y, 1e-3) assert np.allclose(det_annos[0]['score'], expected_score) assert np.allclose(det_annos[0]['dimensions'], expected_dimensions) assert os.path.exists(expected_file_path) diff --git a/tests/test_data/test_datasets/test_lyft_dataset.py b/tests/test_data/test_datasets/test_lyft_dataset.py index a226e50fc8..71008d486b 100644 --- a/tests/test_data/test_datasets/test_lyft_dataset.py +++ b/tests/test_data/test_datasets/test_lyft_dataset.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
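The recomputed KITTI expectations above wrap the adjusted yaw with limit_period. As a rough NumPy sketch of the behaviour relied on here (assuming the usual offset=0.5 default, so period=2*pi wraps angles into [-pi, pi)); this is an illustration, not the library source:

import numpy as np

def limit_period_np(val, offset=0.5, period=np.pi):
    # wrap val into [-offset * period, (1 - offset) * period)
    return val - np.floor(val / period + offset) * period

# with period=2*pi this maps any yaw into [-pi, pi), e.g.
print(limit_period_np(np.array([4.0]), period=2 * np.pi))  # ~ 4.0 - 2*pi = -2.283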
+import tempfile + import mmcv import numpy as np -import tempfile import torch +from mmdet3d.core import limit_period from mmdet3d.datasets import LyftDataset @@ -11,6 +13,7 @@ def test_getitem(): np.random.seed(0) torch.manual_seed(0) root_path = './tests/data/lyft' + # in coordinate system refactor, this test file is modified ann_file = './tests/data/lyft/lyft_infos.pkl' class_names = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal') @@ -49,9 +52,11 @@ def test_getitem(): pcd_horizontal_flip = data['img_metas']._data['pcd_horizontal_flip'] pcd_scale_factor = data['img_metas']._data['pcd_scale_factor'] pcd_rotation = data['img_metas']._data['pcd_rotation'] + pcd_rotation_angle = data['img_metas']._data['pcd_rotation_angle'] sample_idx = data['img_metas']._data['sample_idx'] - pcd_rotation_expected = np.array([[0.99869376, -0.05109515, 0.], - [0.05109515, 0.99869376, 0.], + # coord sys refactor + pcd_rotation_expected = np.array([[0.99869376, 0.05109515, 0.], + [-0.05109515, 0.99869376, 0.], [0., 0., 1.]]) assert pts_filename == \ 'tests/data/lyft/lidar/host-a017_lidar1_1236118886901125926.bin' @@ -82,6 +87,21 @@ def test_getitem(): expected_gt_labels = np.array([0, 4, 7]) original_classes = lyft_dataset.CLASSES + # manually go through pipeline + expected_points[:, :3] = ( + (expected_points[:, :3] * torch.tensor([1, -1, 1])) + @ pcd_rotation_expected @ pcd_rotation_expected) * torch.tensor( + [1, -1, 1]) + expected_gt_bboxes_3d[:, :3] = ( + (expected_gt_bboxes_3d[:, :3] * torch.tensor([1, -1, 1])) + @ pcd_rotation_expected @ pcd_rotation_expected) * torch.tensor( + [1, -1, 1]) + expected_gt_bboxes_3d[:, 3:6] = expected_gt_bboxes_3d[:, [4, 3, 5]] + expected_gt_bboxes_3d[:, 6:] = -expected_gt_bboxes_3d[:, 6:] \ + - np.pi / 2 - pcd_rotation_angle * 2 + expected_gt_bboxes_3d[:, 6:] = limit_period( + expected_gt_bboxes_3d[:, 6:], period=np.pi * 2) + assert torch.allclose(points, expected_points, 1e-2) assert torch.allclose(gt_bboxes_3d.tensor, expected_gt_bboxes_3d, 1e-3) assert np.all(gt_labels_3d.numpy() == expected_gt_labels) @@ -110,8 +130,10 @@ def test_getitem(): def test_evaluate(): root_path = './tests/data/lyft' + # in coordinate system refactor, this test file is modified ann_file = './tests/data/lyft/lyft_infos_val.pkl' lyft_dataset = LyftDataset(ann_file, None, root_path) + # in coordinate system refactor, this test file is modified results = mmcv.load('./tests/data/lyft/sample_results.pkl') ap_dict = lyft_dataset.evaluate(results, 'bbox') car_precision = ap_dict['pts_bbox_Lyft/car_AP'] @@ -119,9 +141,10 @@ def test_evaluate(): def test_show(): - import mmcv from os import path as osp + import mmcv + from mmdet3d.core.bbox import LiDARInstance3DBoxes tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name @@ -149,11 +172,11 @@ def test_show(): kitti_dataset = LyftDataset(ann_file, None, root_path) boxes_3d = LiDARInstance3DBoxes( torch.tensor( - [[46.1218, -4.6496, -0.9275, 0.5316, 1.4442, 1.7450, 1.1749], - [33.3189, 0.1981, 0.3136, 0.5656, 1.2301, 1.7985, 1.5723], - [46.1366, -4.6404, -0.9510, 0.5162, 1.6501, 1.7540, 1.3778], - [33.2646, 0.2297, 0.3446, 0.5746, 1.3365, 1.7947, 1.5430], - [58.9079, 16.6272, -1.5829, 1.5656, 3.9313, 1.4899, 1.5505]])) + [[46.1218, -4.6496, -0.9275, 1.4442, 0.5316, 1.7450, -2.7457], + [33.3189, 0.1981, 0.3136, 1.2301, 0.5656, 1.7985, 3.1401], + [46.1366, -4.6404, -0.9510, 1.6501, 0.5162, 1.7540, -2.9486], + [33.2646, 0.2297, 0.3446, 1.3365, 0.5746, 1.7947, -3.1138], + [58.9079, 
16.6272, -1.5829, 3.9313, 1.5656, 1.4899, -3.1213]])) scores_3d = torch.tensor([0.1815, 0.1663, 0.5792, 0.2194, 0.2780]) labels_3d = torch.tensor([0, 0, 1, 1, 2]) result = dict(boxes_3d=boxes_3d, scores_3d=scores_3d, labels_3d=labels_3d) diff --git a/tests/test_data/test_datasets/test_nuscene_dataset.py b/tests/test_data/test_datasets/test_nuscene_dataset.py index e2e8b55fd9..f7b7656078 100644 --- a/tests/test_data/test_datasets/test_nuscene_dataset.py +++ b/tests/test_data/test_datasets/test_nuscene_dataset.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -import numpy as np import tempfile + +import numpy as np import torch from mmdet3d.datasets import NuScenesDataset @@ -65,9 +66,10 @@ def test_getitem(): def test_show(): - import mmcv from os import path as osp + import mmcv + from mmdet3d.core.bbox import LiDARInstance3DBoxes tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name diff --git a/tests/test_data/test_datasets/test_nuscenes_mono_dataset.py b/tests/test_data/test_datasets/test_nuscenes_mono_dataset.py index f88775d2ef..d3a37f46ea 100644 --- a/tests/test_data/test_datasets/test_nuscenes_mono_dataset.py +++ b/tests/test_data/test_datasets/test_nuscenes_mono_dataset.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import tempfile +from os import path as osp + import mmcv import numpy as np import pytest -import tempfile import torch -from os import path as osp from mmdet3d.datasets import NuScenesMonoDataset diff --git a/tests/test_data/test_datasets/test_s3dis_dataset.py b/tests/test_data/test_datasets/test_s3dis_dataset.py index 6d254bf473..eb63d8c384 100644 --- a/tests/test_data/test_datasets/test_s3dis_dataset.py +++ b/tests/test_data/test_datasets/test_s3dis_dataset.py @@ -239,10 +239,11 @@ def test_seg_evaluate(): def test_seg_show(): - import mmcv import tempfile from os import path as osp + import mmcv + tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name root_path = './tests/data/s3dis' diff --git a/tests/test_data/test_datasets/test_scannet_dataset.py b/tests/test_data/test_datasets/test_scannet_dataset.py index e66f2b0020..91315ce45e 100644 --- a/tests/test_data/test_datasets/test_scannet_dataset.py +++ b/tests/test_data/test_datasets/test_scannet_dataset.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy + import numpy as np import pytest import torch @@ -200,10 +201,11 @@ def test_evaluate(): def test_show(): - import mmcv import tempfile from os import path as osp + import mmcv + from mmdet3d.core.bbox import DepthInstance3DBoxes tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name @@ -581,10 +583,11 @@ def test_seg_evaluate(): def test_seg_show(): - import mmcv import tempfile from os import path as osp + import mmcv + tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name root_path = './tests/data/scannet' @@ -652,9 +655,10 @@ def test_seg_show(): def test_seg_format_results(): - import mmcv from os import path as osp + import mmcv + root_path = './tests/data/scannet' ann_file = './tests/data/scannet/scannet_infos.pkl' scannet_dataset = ScanNetSegDataset( diff --git a/tests/test_data/test_datasets/test_sunrgbd_dataset.py b/tests/test_data/test_datasets/test_sunrgbd_dataset.py index e63d71f2cd..8cd6118cc8 100644 --- a/tests/test_data/test_datasets/test_sunrgbd_dataset.py +++ b/tests/test_data/test_datasets/test_sunrgbd_dataset.py @@ -8,6 +8,7 @@ def _generate_sunrgbd_dataset_config(): root_path = './tests/data/sunrgbd' + # in coordinate system refactor, this test file is modified ann_file = './tests/data/sunrgbd/sunrgbd_infos.pkl' class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') @@ -120,6 +121,8 @@ def test_getitem(): [[0.8308, 4.1168, -1.2035, 2.2493, 1.8444, 1.9245, 1.6486], [2.3002, 4.8149, -1.2442, 0.5718, 0.8629, 0.9510, 1.6030], [-1.1477, 1.8090, -1.1725, 0.6965, 1.5273, 2.0563, 0.0552]]) + # coord sys refactor (rotation is correct but yaw has to be reversed) + expected_gt_bboxes_3d[:, 6:] = -expected_gt_bboxes_3d[:, 6:] expected_gt_labels = np.array([0, 7, 6]) original_classes = sunrgbd_dataset.CLASSES @@ -207,10 +210,11 @@ def test_evaluate(): def test_show(): - import mmcv import tempfile from os import path as osp + import mmcv + from mmdet3d.core.bbox import DepthInstance3DBoxes tmp_dir = tempfile.TemporaryDirectory() temp_dir = tmp_dir.name diff --git a/tests/test_data/test_datasets/test_waymo_dataset.py b/tests/test_data/test_datasets/test_waymo_dataset.py index 225e169a3f..302766dc0a 100644 --- a/tests/test_data/test_datasets/test_waymo_dataset.py +++ b/tests/test_data/test_datasets/test_waymo_dataset.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import tempfile + import numpy as np import pytest -import tempfile import torch from mmdet3d.datasets import WaymoDataset @@ -16,6 +17,7 @@ def _generate_waymo_train_dataset_config(): file_client_args = dict(backend='disk') db_sampler = dict( data_root=data_root, + # in coordinate system refactor, this test file is modified info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( @@ -114,7 +116,7 @@ def test_getitem(): gt_bboxes_3d = data['gt_bboxes_3d']._data gt_labels_3d = data['gt_labels_3d']._data expected_gt_bboxes_3d = torch.tensor( - [[31.4750, -4.5690, 2.1857, 2.3519, 6.0931, 3.1756, -1.2895]]) + [[31.8048, -0.1002, 2.1857, 6.0931, 2.3519, 3.1756, -0.1403]]) expected_gt_labels_3d = torch.tensor([0]) assert points.shape == (765, 5) assert torch.allclose( @@ -132,8 +134,8 @@ def test_evaluate(): pipeline, classes, modality) boxes_3d = LiDARInstance3DBoxes( torch.tensor([[ - 6.9684e+01, 3.3335e+01, 4.1465e-02, 2.0100e+00, 4.3600e+00, - 1.4600e+00, -9.0000e-02 + 6.9684e+01, 3.3335e+01, 4.1465e-02, 4.3600e+00, 2.0100e+00, + 1.4600e+00, 9.0000e-02 - np.pi / 2 ]])) labels_3d = torch.tensor([0]) scores_3d = torch.tensor([0.5]) @@ -150,8 +152,8 @@ def test_evaluate(): metric = ['waymo'] boxes_3d = LiDARInstance3DBoxes( torch.tensor([[ - 6.9684e+01, 3.3335e+01, 4.1465e-02, 2.0100e+00, 4.3600e+00, - 1.4600e+00, -9.0000e-02 + 6.9684e+01, 3.3335e+01, 4.1465e-02, 4.3600e+00, 2.0100e+00, + 1.4600e+00, 9.0000e-02 - np.pi / 2 ]])) labels_3d = torch.tensor([0]) scores_3d = torch.tensor([0.8]) @@ -164,9 +166,10 @@ def test_evaluate(): def test_show(): - import mmcv from os import path as osp + import mmcv + from mmdet3d.core.bbox import LiDARInstance3DBoxes # Waymo shares show function with KITTI so I just copy it here @@ -178,11 +181,11 @@ def test_show(): data_root, ann_file, split=split, modality=modality, pipeline=pipeline) boxes_3d = LiDARInstance3DBoxes( torch.tensor( - [[46.1218, -4.6496, -0.9275, 0.5316, 1.4442, 1.7450, 1.1749], - [33.3189, 0.1981, 0.3136, 0.5656, 1.2301, 1.7985, 1.5723], - [46.1366, -4.6404, -0.9510, 0.5162, 1.6501, 1.7540, 1.3778], - [33.2646, 0.2297, 0.3446, 0.5746, 1.3365, 1.7947, 1.5430], - [58.9079, 16.6272, -1.5829, 1.5656, 3.9313, 1.4899, 1.5505]])) + [[46.1218, -4.6496, -0.9275, 1.4442, 0.5316, 1.7450, 1.1749], + [33.3189, 0.1981, 0.3136, 1.2301, 0.5656, 1.7985, 1.5723], + [46.1366, -4.6404, -0.9510, 1.6501, 0.5162, 1.7540, 1.3778], + [33.2646, 0.2297, 0.3446, 1.3365, 0.5746, 1.7947, 1.5430], + [58.9079, 16.6272, -1.5829, 3.9313, 1.5656, 1.4899, 1.5505]])) scores_3d = torch.tensor([0.1815, 0.1663, 0.5792, 0.2194, 0.2780]) labels_3d = torch.tensor([0, 0, 1, 1, 2]) result = dict(boxes_3d=boxes_3d, scores_3d=scores_3d, labels_3d=labels_3d) @@ -231,8 +234,8 @@ def test_format_results(): pipeline, classes, modality) boxes_3d = LiDARInstance3DBoxes( torch.tensor([[ - 6.9684e+01, 3.3335e+01, 4.1465e-02, 2.0100e+00, 4.3600e+00, - 1.4600e+00, -9.0000e-02 + 6.9684e+01, 3.3335e+01, 4.1465e-02, 4.3600e+00, 2.0100e+00, + 1.4600e+00, 9.0000e-02 - np.pi / 2 ]])) labels_3d = torch.tensor([0]) scores_3d = torch.tensor([0.5]) @@ -252,11 +255,11 @@ def test_format_results(): assert np.all(result_files[0]['name'] == expected_name) assert np.allclose(result_files[0]['truncated'], expected_truncated) assert np.all(result_files[0]['occluded'] == expected_occluded) - assert np.allclose(result_files[0]['alpha'], expected_alpha) - assert np.allclose(result_files[0]['bbox'], expected_bbox) + assert np.allclose(result_files[0]['bbox'], expected_bbox, 1e-3) assert 
np.allclose(result_files[0]['dimensions'], expected_dimensions) assert np.allclose(result_files[0]['location'], expected_location) assert np.allclose(result_files[0]['rotation_y'], expected_rotation_y) assert np.allclose(result_files[0]['score'], expected_score) assert np.allclose(result_files[0]['sample_idx'], expected_sample_idx) + assert np.allclose(result_files[0]['alpha'], expected_alpha) tmp_dir.cleanup() diff --git a/tests/test_data/test_pipelines/test_augmentations/test_transforms_3d.py b/tests/test_data/test_pipelines/test_augmentations/test_transforms_3d.py index fffb6fe1b4..a12022213c 100644 --- a/tests/test_data/test_pipelines/test_augmentations/test_transforms_3d.py +++ b/tests/test_data/test_pipelines/test_augmentations/test_transforms_3d.py @@ -8,12 +8,14 @@ DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.core.bbox import Coord3DMode from mmdet3d.core.points import DepthPoints, LiDARPoints -from mmdet3d.datasets import (BackgroundPointsFilter, GlobalAlignment, - GlobalRotScaleTrans, ObjectNameFilter, - ObjectNoise, ObjectRangeFilter, ObjectSample, - PointSample, PointShuffle, PointsRangeFilter, - RandomDropPointsColor, RandomFlip3D, - RandomJitterPoints, VoxelBasedPointSampler) +# yapf: disable +from mmdet3d.datasets import (AffineResize, BackgroundPointsFilter, + GlobalAlignment, GlobalRotScaleTrans, + ObjectNameFilter, ObjectNoise, ObjectRangeFilter, + ObjectSample, PointSample, PointShuffle, + PointsRangeFilter, RandomDropPointsColor, + RandomFlip3D, RandomJitterPoints, + RandomShiftScale, VoxelBasedPointSampler) def test_remove_points_in_boxes(): @@ -132,8 +134,12 @@ def test_object_noise(): input_dict = object_noise(input_dict) points = input_dict['points'] gt_bboxes_3d = input_dict['gt_bboxes_3d'].tensor - expected_gt_bboxes_3d = torch.tensor( - [[9.1724, -1.7559, -1.3550, 0.4800, 1.2000, 1.8900, 0.0505]]) + + # coord sys refactor (lidar2cam) + expected_gt_bboxes_3d = torch.tensor([[ + 9.1724, -1.7559, -1.3550, 1.2000, 0.4800, 1.8900, + 0.0505 - float(rots) * 2 - np.pi / 2 + ]]) repr_str = repr(object_noise) expected_repr_str = 'ObjectNoise(num_try=100, ' \ 'translation_std=[0.25, 0.25, 0.25], ' \ @@ -522,11 +528,11 @@ def test_random_flip_3d(): [21.2334, -9.3607, -0.2588, 0.0000], [21.2179, -9.4372, -0.2598, 0.0000]]) expected_gt_bboxes_3d = torch.tensor( - [[38.9229, -18.4417, -1.1459, 0.7100, 1.7600, 1.8600, 5.4068], - [12.7768, -0.5795, -2.2682, 0.5700, 0.9900, 1.7200, 5.6445], - [12.7557, -2.2996, -1.4869, 0.6100, 1.1100, 1.9000, 5.0806], - [10.6677, -0.8064, -1.5435, 0.7900, 0.9600, 1.7900, 2.0560], - [5.0903, -5.1004, -1.2694, 0.7100, 1.7000, 1.8300, 5.0552]]) + [[38.9229, -18.4417, -1.1459, 0.7100, 1.7600, 1.8600, 2.2652], + [12.7768, -0.5795, -2.2682, 0.5700, 0.9900, 1.7200, 2.5029], + [12.7557, -2.2996, -1.4869, 0.6100, 1.1100, 1.9000, 1.9390], + [10.6677, -0.8064, -1.5435, 0.7900, 0.9600, 1.7900, -1.0856], + [5.0903, -5.1004, -1.2694, 0.7100, 1.7000, 1.8300, 1.9136]]) repr_str = repr(random_flip_3d) expected_repr_str = 'RandomFlip3D(sync_2d=True,' \ ' flip_ratio_bev_vertical=1.0)' @@ -751,3 +757,96 @@ def test_points_sample(): select_idx = np.array([449, 444]) expected_pts = points.tensor.numpy()[select_idx] assert np.allclose(sampled_pts.tensor.numpy(), expected_pts) + + +def test_affine_resize(): + + def create_random_bboxes(num_bboxes, img_w, img_h): + bboxes_left_top = np.random.uniform(0, 0.5, size=(num_bboxes, 2)) + bboxes_right_bottom = np.random.uniform(0.5, 1, size=(num_bboxes, 2)) + bboxes = np.concatenate((bboxes_left_top, 
bboxes_right_bottom), 1)
+        bboxes = (bboxes * np.array([img_w, img_h, img_w, img_h])).astype(
+            np.float32)
+        return bboxes
+
+    affine_resize = AffineResize(img_scale=(1290, 384), down_ratio=4)
+
+    # test the case where RandomShiftScale is not applied before AffineResize
+    results = dict()
+    img = mmcv.imread('./tests/data/kitti/training/image_2/000000.png',
+                      'color')
+    results['img'] = img
+    results['bbox_fields'] = ['gt_bboxes']
+    results['bbox3d_fields'] = ['gt_bboxes_3d']
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_3d = CameraInstance3DBoxes(torch.randn((8, 7)))
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_labels3d'] = results['gt_labels']
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_3d'] = gt_bboxes_3d
+    results['depths'] = np.random.randn(gt_bboxes.shape[0])
+    centers2d_x = (gt_bboxes[:, [0]] + gt_bboxes[:, [2]]) / 2
+    centers2d_y = (gt_bboxes[:, [1]] + gt_bboxes[:, [3]]) / 2
+    centers2d = np.concatenate((centers2d_x, centers2d_y), axis=1)
+    results['centers2d'] = centers2d
+
+    results = affine_resize(results)
+
+    assert results['gt_labels'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_labels3d'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_bboxes'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_bboxes_3d'].tensor.shape[0] == \
+        results['centers2d'].shape[0]
+    assert results['affine_aug'] is False
+
+    # test the case where RandomShiftScale is applied before AffineResize
+    results = dict()
+    img = mmcv.imread('./tests/data/kitti/training/image_2/000000.png',
+                      'color')
+    results['img'] = img
+    results['bbox_fields'] = ['gt_bboxes']
+    results['bbox3d_fields'] = ['gt_bboxes_3d']
+    h, w, _ = img.shape
+    center = np.array([w / 2, h / 2], dtype=np.float32)
+    size = np.array([w, h], dtype=np.float32)
+
+    results['center'] = center
+    results['size'] = size
+    results['affine_aug'] = False
+
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_3d = CameraInstance3DBoxes(torch.randn((8, 7)))
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_labels3d'] = results['gt_labels']
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_3d'] = gt_bboxes_3d
+    results['depths'] = np.random.randn(gt_bboxes.shape[0])
+    centers2d_x = (gt_bboxes[:, [0]] + gt_bboxes[:, [2]]) / 2
+    centers2d_y = (gt_bboxes[:, [1]] + gt_bboxes[:, [3]]) / 2
+    centers2d = np.concatenate((centers2d_x, centers2d_y), axis=1)
+    results['centers2d'] = centers2d
+
+    results = affine_resize(results)
+
+    assert results['gt_labels'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_labels3d'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_bboxes'].shape[0] == results['centers2d'].shape[0]
+    assert results['gt_bboxes_3d'].tensor.shape[0] == results[
+        'centers2d'].shape[0]
+    assert 'center' in results
+    assert 'size' in results
+    assert 'affine_aug' in results
+
+
+def test_random_shift_scale():
+    random_shift_scale = RandomShiftScale(shift_scale=(0.2, 0.4), aug_prob=0.3)
+    results = dict()
+    img = mmcv.imread('./tests/data/kitti/training/image_2/000000.png',
+                      'color')
+    results['img'] = img
+    results = random_shift_scale(results)
+    assert results['center'].dtype == np.float32
+    assert results['size'].dtype == np.float32
+    assert 'affine_aug' in results
diff --git a/tests/test_data/test_pipelines/test_indoor_pipeline.py b/tests/test_data/test_pipelines/test_indoor_pipeline.py
index 0a59f1f220..5d27cf8746 100644
--- a/tests/test_data/test_pipelines/test_indoor_pipeline.py
+++ b/tests/test_data/test_pipelines/test_indoor_pipeline.py
@@ -1,8 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp
+
 import mmcv
 import numpy as np
 import torch
-from os import path as osp
 
 from mmdet3d.core.bbox import DepthInstance3DBoxes
 from mmdet3d.datasets.pipelines import Compose
@@ -316,10 +317,24 @@ def test_sunrgbd_pipeline():
                                     [0.8636, 1.3511, 0.0504, 0.0304],
                                     [0.8690, 1.3461, 0.1265, 0.1065],
                                     [0.8668, 1.3434, 0.1216, 0.1017]])
+    # Depth coordinate system update: only the yaw changes, since rotation in
+    # Depth is counter-clockwise while the yaw angle was originally clockwise.
+    # Heading angles in the SUNRGBD data also reverse their sign, and the
+    # horizontal flip reverses the sign once more.
+    rotation_angle = info['annos']['rotation_y']
     expected_gt_bboxes_3d = torch.tensor(
-        [[-1.2136, 4.0206, -0.2412, 2.2493, 1.8444, 1.9245, 1.3989],
-         [-2.7420, 4.5777, -0.7686, 0.5718, 0.8629, 0.9510, 1.4446],
-         [0.9729, 1.9087, -0.1443, 0.6965, 1.5273, 2.0563, 2.9924]])
+        [[
+            -1.2136, 4.0206, -0.2412, 2.2493, 1.8444, 1.9245,
+            1.3989 + 0.047001579467984445 * 2 - 2 * rotation_angle[0]
+        ],
+         [
+             -2.7420, 4.5777, -0.7686, 0.5718, 0.8629, 0.9510,
+             1.4446 + 0.047001579467984445 * 2 - 2 * rotation_angle[1]
+         ],
+         [
+             0.9729, 1.9087, -0.1443, 0.6965, 1.5273, 2.0563,
+             2.9924 + 0.047001579467984445 * 2 - 2 * rotation_angle[2]
+         ]]).float()
     expected_gt_labels_3d = np.array([0, 7, 6])
     assert torch.allclose(gt_bboxes_3d.tensor, expected_gt_bboxes_3d, 1e-3)
     assert np.allclose(gt_labels_3d.flatten(), expected_gt_labels_3d)
diff --git a/tests/test_data/test_pipelines/test_loadings/test_loading.py b/tests/test_data/test_pipelines/test_loadings/test_loading.py
index f360da2748..ef0fc9ae8e 100644
--- a/tests/test_data/test_pipelines/test_loadings/test_loading.py
+++ b/tests/test_data/test_pipelines/test_loadings/test_loading.py
@@ -1,8 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp + import mmcv import numpy as np import pytest -from os import path as osp from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet3d.core.points import DepthPoints, LiDARPoints diff --git a/tests/test_data/test_pipelines/test_outdoor_pipeline.py b/tests/test_data/test_pipelines/test_outdoor_pipeline.py index fb6b9c1050..9d5a591893 100644 --- a/tests/test_data/test_pipelines/test_outdoor_pipeline.py +++ b/tests/test_data/test_pipelines/test_outdoor_pipeline.py @@ -38,63 +38,64 @@ def test_outdoor_aug_pipeline(): ] pipeline = Compose(train_pipeline) + # coord sys refactor: reverse sign of yaw gt_bboxes_3d = LiDARInstance3DBoxes( torch.tensor([ [ 2.16902428e+01, -4.06038128e-02, -1.61906636e+00, - 1.65999997e+00, 3.20000005e+00, 1.61000001e+00, -1.53999996e+00 + 1.65999997e+00, 3.20000005e+00, 1.61000001e+00, 1.53999996e+00 ], [ 7.05006886e+00, -6.57459593e+00, -1.60107934e+00, - 2.27999997e+00, 1.27799997e+01, 3.66000009e+00, 1.54999995e+00 + 2.27999997e+00, 1.27799997e+01, 3.66000009e+00, -1.54999995e+00 ], [ 2.24698811e+01, -6.69203758e+00, -1.50118136e+00, - 2.31999993e+00, 1.47299995e+01, 3.64000010e+00, 1.59000003e+00 + 2.31999993e+00, 1.47299995e+01, 3.64000010e+00, -1.59000003e+00 ], [ 3.48291969e+01, -7.09058380e+00, -1.36622977e+00, - 2.31999993e+00, 1.00400000e+01, 3.60999990e+00, 1.61000001e+00 + 2.31999993e+00, 1.00400000e+01, 3.60999990e+00, -1.61000001e+00 ], [ 4.62394600e+01, -7.75838804e+00, -1.32405007e+00, - 2.33999991e+00, 1.28299999e+01, 3.63000011e+00, 1.63999999e+00 + 2.33999991e+00, 1.28299999e+01, 3.63000011e+00, -1.63999999e+00 ], [ 2.82966995e+01, -5.55755794e-01, -1.30332506e+00, - 1.47000003e+00, 2.23000002e+00, 1.48000002e+00, -1.57000005e+00 + 1.47000003e+00, 2.23000002e+00, 1.48000002e+00, 1.57000005e+00 ], [ 2.66690197e+01, 2.18230209e+01, -1.73605704e+00, - 1.55999994e+00, 3.48000002e+00, 1.39999998e+00, -1.69000006e+00 + 1.55999994e+00, 3.48000002e+00, 1.39999998e+00, 1.69000006e+00 ], [ 3.13197803e+01, 8.16214371e+00, -1.62177873e+00, - 1.74000001e+00, 3.76999998e+00, 1.48000002e+00, 2.78999996e+00 + 1.74000001e+00, 3.76999998e+00, 1.48000002e+00, -2.78999996e+00 ], [ 4.34395561e+01, -1.95209332e+01, -1.20757008e+00, - 1.69000006e+00, 4.09999990e+00, 1.40999997e+00, -1.53999996e+00 + 1.69000006e+00, 4.09999990e+00, 1.40999997e+00, 1.53999996e+00 ], [ 3.29882965e+01, -3.79360509e+00, -1.69245458e+00, - 1.74000001e+00, 4.09000015e+00, 1.49000001e+00, -1.52999997e+00 + 1.74000001e+00, 4.09000015e+00, 1.49000001e+00, 1.52999997e+00 ], [ 3.85469360e+01, 8.35060215e+00, -1.31423414e+00, - 1.59000003e+00, 4.28000021e+00, 1.45000005e+00, 1.73000002e+00 + 1.59000003e+00, 4.28000021e+00, 1.45000005e+00, -1.73000002e+00 ], [ 2.22492104e+01, -1.13536005e+01, -1.38272512e+00, - 1.62000000e+00, 3.55999994e+00, 1.71000004e+00, 2.48000002e+00 + 1.62000000e+00, 3.55999994e+00, 1.71000004e+00, -2.48000002e+00 ], [ 3.36115799e+01, -1.97708054e+01, -4.92827654e-01, - 1.64999998e+00, 3.54999995e+00, 1.79999995e+00, -1.57000005e+00 + 1.64999998e+00, 3.54999995e+00, 1.79999995e+00, 1.57000005e+00 ], [ 9.85029602e+00, -1.51294518e+00, -1.66834795e+00, - 1.59000003e+00, 3.17000008e+00, 1.38999999e+00, -8.39999974e-01 + 1.59000003e+00, 3.17000008e+00, 1.38999999e+00, 8.39999974e-01 ] ], dtype=torch.float32)) @@ -105,23 +106,59 @@ def test_outdoor_aug_pipeline(): bbox3d_fields=[], img_fields=[]) + origin_center = gt_bboxes_3d.tensor[:, :3].clone() + origin_angle = gt_bboxes_3d.tensor[:, 6].clone() + output = pipeline(results) + # manually go through 
the pipeline + rotation_angle = output['img_metas']._data['pcd_rotation_angle'] + rotation_matrix = output['img_metas']._data['pcd_rotation'] + noise_angle = torch.tensor([ + 0.70853819, -0.19160091, -0.71116999, 0.49571753, -0.12447527, + -0.4690133, -0.34776965, -0.65692282, -0.52442831, -0.01575567, + -0.61849673, 0.6572608, 0.30312288, -0.19182971 + ]) + noise_trans = torch.tensor([[1.7641e+00, 4.0016e-01, 4.8937e-01], + [-1.3065e+00, 1.6581e+00, -5.9082e-02], + [-1.5504e+00, 4.1732e-01, -4.7218e-01], + [-5.2158e-01, -1.1847e+00, 4.8035e-01], + [-8.9637e-01, -1.9627e+00, 7.9241e-01], + [1.3240e-02, -1.2194e-01, 1.6953e-01], + [8.1798e-01, -2.7891e-01, 7.1578e-01], + [-4.1733e-04, 3.7416e-01, 2.0478e-01], + [1.5218e-01, -3.7413e-01, -6.7257e-03], + [-1.9138e+00, -2.2855e+00, -8.0092e-01], + [1.5933e+00, 5.6872e-01, -5.7244e-02], + [-1.8523e+00, -7.1333e-01, -8.8111e-01], + [5.2678e-01, 1.0106e-01, -1.9432e-01], + [-7.2449e-01, -8.0292e-01, -1.1334e-02]]) + angle = -origin_angle - noise_angle + torch.tensor(rotation_angle) + angle -= 2 * np.pi * (angle >= np.pi) + angle += 2 * np.pi * (angle < -np.pi) + scale = output['img_metas']._data['pcd_scale_factor'] + expected_tensor = torch.tensor( - [[20.6514, -8.8250, -1.0816, 1.5893, 3.0637, 1.5414, -1.9216], - [7.9374, 4.9457, -1.2008, 2.1829, 12.2357, 3.5041, 1.6629], - [20.8115, -2.0273, -1.8893, 2.2212, 14.1026, 3.4850, 2.6513], - [32.3850, -5.2135, -1.1321, 2.2212, 9.6124, 3.4562, 2.6498], - [43.7022, -7.8316, -0.5090, 2.2403, 12.2836, 3.4754, 2.0146], - [25.3300, -9.6670, -1.0855, 1.4074, 2.1350, 1.4170, -0.7141], - [16.5414, -29.0583, -0.9768, 1.4936, 3.3318, 1.3404, -0.7153], - [24.6548, -18.9226, -1.3567, 1.6659, 3.6094, 1.4170, 1.3970], - [45.8403, 1.8183, -1.1626, 1.6180, 3.9254, 1.3499, -0.6886], - [30.6288, -8.4497, -1.4881, 1.6659, 3.9158, 1.4265, -0.7241], - [32.3316, -22.4611, -1.3131, 1.5223, 4.0977, 1.3882, 2.4186], - [22.4492, 3.2944, -2.1674, 1.5510, 3.4084, 1.6372, 0.3928], - [37.3824, 5.0472, -0.6579, 1.5797, 3.3988, 1.7233, -1.4862], - [8.9259, -1.2578, -1.6081, 1.5223, 3.0350, 1.3308, -1.7212]]) + [[20.6514, -8.8250, -1.0816, 1.5893, 3.0637, 1.5414], + [7.9374, 4.9457, -1.2008, 2.1829, 12.2357, 3.5041], + [20.8115, -2.0273, -1.8893, 2.2212, 14.1026, 3.4850], + [32.3850, -5.2135, -1.1321, 2.2212, 9.6124, 3.4562], + [43.7022, -7.8316, -0.5090, 2.2403, 12.2836, 3.4754], + [25.3300, -9.6670, -1.0855, 1.4074, 2.1350, 1.4170], + [16.5414, -29.0583, -0.9768, 1.4936, 3.3318, 1.3404], + [24.6548, -18.9226, -1.3567, 1.6659, 3.6094, 1.4170], + [45.8403, 1.8183, -1.1626, 1.6180, 3.9254, 1.3499], + [30.6288, -8.4497, -1.4881, 1.6659, 3.9158, 1.4265], + [32.3316, -22.4611, -1.3131, 1.5223, 4.0977, 1.3882], + [22.4492, 3.2944, -2.1674, 1.5510, 3.4084, 1.6372], + [37.3824, 5.0472, -0.6579, 1.5797, 3.3988, 1.7233], + [8.9259, -1.2578, -1.6081, 1.5223, 3.0350, 1.3308]]) + + expected_tensor[:, :3] = (( + (origin_center + noise_trans) * torch.tensor([1, -1, 1])) + @ rotation_matrix) * scale + + expected_tensor = torch.cat([expected_tensor, angle.unsqueeze(-1)], dim=-1) assert torch.allclose( output['gt_bboxes_3d']._data.tensor, expected_tensor, atol=1e-3) @@ -208,6 +245,11 @@ def test_outdoor_velocity_aug_pipeline(): bbox3d_fields=[], img_fields=[]) + origin_center = gt_bboxes_3d.tensor[:, :3].clone() + origin_angle = gt_bboxes_3d.tensor[:, 6].clone( + ) # TODO: ObjectNoise modifies tensor!! 
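+    # The original centers, yaw angles and velocities are cloned here so that
+    # the expected boxes can be recomputed below from the recorded
+    # pcd_rotation, pcd_scale_factor and horizontal flip, instead of being
+    # hard-coded under the refactored coordinate convention.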
+ origin_velo = gt_bboxes_3d.tensor[:, 7:9].clone() + output = pipeline(results) expected_tensor = torch.tensor( @@ -247,5 +289,21 @@ def test_outdoor_velocity_aug_pipeline(): -4.4522e+00, -2.9166e+01, -7.8938e-01, 2.2841e+00, 3.8348e+00, 1.5925e+00, 1.4721e+00, -7.8371e-03, -8.1931e-03 ]]) + # coord sys refactor (manually go through pipeline) + rotation_angle = output['img_metas']._data['pcd_rotation_angle'] + rotation_matrix = output['img_metas']._data['pcd_rotation'] + expected_tensor[:, :3] = ((origin_center @ rotation_matrix) * + output['img_metas']._data['pcd_scale_factor'] * + torch.tensor([1, -1, 1]))[[ + 0, 1, 2, 3, 4, 5, 6, 7, 9 + ]] + angle = -origin_angle - rotation_angle + angle -= 2 * np.pi * (angle >= np.pi) + angle += 2 * np.pi * (angle < -np.pi) + expected_tensor[:, 6:7] = angle.unsqueeze(-1)[[0, 1, 2, 3, 4, 5, 6, 7, 9]] + expected_tensor[:, + 7:9] = ((origin_velo @ rotation_matrix[:2, :2]) * + output['img_metas']._data['pcd_scale_factor'] * + torch.tensor([1, -1]))[[0, 1, 2, 3, 4, 5, 6, 7, 9]] assert torch.allclose( output['gt_bboxes_3d']._data.tensor, expected_tensor, atol=1e-3) diff --git a/tests/test_metrics/test_kitti_eval.py b/tests/test_metrics/test_kitti_eval.py index 7405e6b0a5..7447cebaa6 100644 --- a/tests/test_metrics/test_kitti_eval.py +++ b/tests/test_metrics/test_kitti_eval.py @@ -83,31 +83,49 @@ def test_do_eval(): [[0.5, 0.5, 0.7], [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]]]) eval_types = ['bbox', 'bev', '3d', 'aos'] - mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval([gt_anno], [dt_anno], + mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox,\ + mAP40_bev, mAP40_3d, mAP40_aos = do_eval([gt_anno], [dt_anno], current_classes, min_overlaps, eval_types) - expected_mAP_bbox = np.array([[[0., 0.], [9.09090909, 9.09090909], - [9.09090909, 9.09090909]], - [[0., 0.], [9.09090909, 9.09090909], - [9.09090909, 9.09090909]], - [[0., 0.], [9.09090909, 9.09090909], - [9.09090909, 9.09090909]]]) - expected_mAP_bev = np.array([[[0., 0.], [0., 0.], [0., 0.]], - [[0., 0.], [0., 0.], [0., 0.]], - [[0., 0.], [0., 0.], [0., 0.]]]) - expected_mAP_3d = np.array([[[0., 0.], [0., 0.], [0., 0.]], - [[0., 0.], [0., 0.], [0., 0.]], - [[0., 0.], [0., 0.], [0., 0.]]]) - expected_mAP_aos = np.array([[[0., 0.], [0.55020816, 0.55020816], - [0.55020816, 0.55020816]], - [[0., 0.], [8.36633862, 8.36633862], - [8.36633862, 8.36633862]], - [[0., 0.], [8.63476893, 8.63476893], - [8.63476893, 8.63476893]]]) - assert np.allclose(mAP_bbox, expected_mAP_bbox) - assert np.allclose(mAP_bev, expected_mAP_bev) - assert np.allclose(mAP_3d, expected_mAP_3d) - assert np.allclose(mAP_aos, expected_mAP_aos) + expected_mAP11_bbox = np.array([[[0., 0.], [9.09090909, 9.09090909], + [9.09090909, 9.09090909]], + [[0., 0.], [9.09090909, 9.09090909], + [9.09090909, 9.09090909]], + [[0., 0.], [9.09090909, 9.09090909], + [9.09090909, 9.09090909]]]) + expected_mAP40_bbox = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [2.5, 2.5], [2.5, 2.5]]]) + expected_mAP11_bev = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]]]) + expected_mAP40_bev = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]]]) + expected_mAP11_3d = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]]]) + expected_mAP40_3d = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]]]) + 
expected_mAP11_aos = np.array([[[0., 0.], [0.55020816, 0.55020816], + [0.55020816, 0.55020816]], + [[0., 0.], [8.36633862, 8.36633862], + [8.36633862, 8.36633862]], + [[0., 0.], [8.63476893, 8.63476893], + [8.63476893, 8.63476893]]]) + expected_mAP40_aos = np.array([[[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [1.58140643, 1.58140643], + [1.58140643, 1.58140643]]]) + assert np.allclose(mAP11_bbox, expected_mAP11_bbox) + assert np.allclose(mAP11_bev, expected_mAP11_bev) + assert np.allclose(mAP11_3d, expected_mAP11_3d) + assert np.allclose(mAP11_aos, expected_mAP11_aos) + assert np.allclose(mAP40_bbox, expected_mAP40_bbox) + assert np.allclose(mAP40_bev, expected_mAP40_bev) + assert np.allclose(mAP40_3d, expected_mAP40_3d) + assert np.allclose(mAP40_aos, expected_mAP40_aos) def test_kitti_eval(): @@ -183,8 +201,14 @@ def test_kitti_eval(): current_classes = [1, 2, 0] result, ret_dict = kitti_eval([gt_anno], [dt_anno], current_classes) - assert np.isclose(ret_dict['KITTI/Overall_2D_moderate'], 9.090909090909092) - assert np.isclose(ret_dict['KITTI/Overall_2D_hard'], 9.090909090909092) + assert np.isclose(ret_dict['KITTI/Overall_2D_AP11_moderate'], + 9.090909090909092) + assert np.isclose(ret_dict['KITTI/Overall_2D_AP11_hard'], + 9.090909090909092) + assert np.isclose(ret_dict['KITTI/Overall_2D_AP40_moderate'], + 0.8333333333333334) + assert np.isclose(ret_dict['KITTI/Overall_2D_AP40_hard'], + 0.8333333333333334) def test_eval_class(): diff --git a/tests/test_metrics/test_losses.py b/tests/test_metrics/test_losses.py index 08cdb6275c..a70834c199 100644 --- a/tests/test_metrics/test_losses.py +++ b/tests/test_metrics/test_losses.py @@ -3,6 +3,8 @@ import torch from torch import nn as nn +from mmdet.models import build_loss + def test_chamfer_disrance(): from mmdet3d.models.losses import ChamferDistance, chamfer_distance @@ -93,7 +95,7 @@ def __init__(self): set_random_seed(0, True) model = ToyModel() - # reduction shoule be in ['none', 'mean', 'sum'] + # reduction should be in ['none', 'mean', 'sum'] with pytest.raises(AssertionError): paconv_corr_loss = PAConvRegularizationLoss(reduction='l2') @@ -109,3 +111,64 @@ def __init__(self): model.modules(), reduction_override='none') assert none_corr_loss.shape[0] == 3 assert torch.allclose(none_corr_loss.mean(), mean_corr_loss) + + +def test_uncertain_smooth_l1_loss(): + from mmdet3d.models.losses import UncertainL1Loss, UncertainSmoothL1Loss + + # reduction should be in ['none', 'mean', 'sum'] + with pytest.raises(AssertionError): + uncertain_l1_loss = UncertainL1Loss(reduction='l2') + with pytest.raises(AssertionError): + uncertain_smooth_l1_loss = UncertainSmoothL1Loss(reduction='l2') + + pred = torch.tensor([1.5783, 0.5972, 1.4821, 0.9488]) + target = torch.tensor([1.0813, -0.3466, -1.1404, -0.9665]) + sigma = torch.tensor([-1.0053, 0.4710, -1.7784, -0.8603]) + + # test uncertain l1 loss + uncertain_l1_loss_cfg = dict( + type='UncertainL1Loss', alpha=1.0, reduction='mean', loss_weight=1.0) + uncertain_l1_loss = build_loss(uncertain_l1_loss_cfg) + mean_l1_loss = uncertain_l1_loss(pred, target, sigma) + expected_l1_loss = torch.tensor(4.7069) + assert torch.allclose(mean_l1_loss, expected_l1_loss, atol=1e-4) + + # test uncertain smooth l1 loss + uncertain_smooth_l1_loss_cfg = dict( + type='UncertainSmoothL1Loss', + alpha=1.0, + beta=0.5, + reduction='mean', + loss_weight=1.0) + uncertain_smooth_l1_loss = build_loss(uncertain_smooth_l1_loss_cfg) + mean_smooth_l1_loss = uncertain_smooth_l1_loss(pred, target, sigma) 
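+    # Both expected values in this test are consistent with
+    #   loss_i = exp(-sigma_i) * err_i + alpha * sigma_i  (alpha = 1 here),
+    # averaged over elements, where err_i is the plain L1 residual for
+    # UncertainL1Loss and the smooth-L1 residual (beta = 0.5) for
+    # UncertainSmoothL1Loss.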
+ expected_smooth_l1_loss = torch.tensor(3.9795) + assert torch.allclose( + mean_smooth_l1_loss, expected_smooth_l1_loss, atol=1e-4) + + +def test_multibin_loss(): + from mmdet3d.models.losses import MultiBinLoss + + # reduction should be in ['none', 'mean', 'sum'] + with pytest.raises(AssertionError): + multibin_loss = MultiBinLoss(reduction='l2') + + pred = torch.tensor([[ + 0.81, 0.32, 0.78, 0.52, 0.24, 0.12, 0.32, 0.11, 1.20, 1.30, 0.20, 0.11, + 0.12, 0.11, 0.23, 0.31 + ], + [ + 0.02, 0.19, 0.78, 0.22, 0.31, 0.12, 0.22, 0.11, + 1.20, 1.30, 0.45, 0.51, 0.12, 0.11, 0.13, 0.61 + ]]) + target = torch.tensor([[1, 1, 0, 0, 2.14, 3.12, 0.68, -2.15], + [1, 1, 0, 0, 3.12, 3.12, 2.34, 1.23]]) + multibin_loss_cfg = dict( + type='MultiBinLoss', reduction='none', loss_weight=1.0) + multibin_loss = build_loss(multibin_loss_cfg) + output_multibin_loss = multibin_loss(pred, target, num_dir_bins=4) + expected_multibin_loss = torch.tensor(2.1120) + assert torch.allclose( + output_multibin_loss, expected_multibin_loss, atol=1e-4) diff --git a/tests/test_models/test_backbones.py b/tests/test_models/test_backbones.py index 6a3d2cb422..392e0ec4c0 100644 --- a/tests/test_models/test_backbones.py +++ b/tests/test_models/test_backbones.py @@ -297,3 +297,59 @@ def test_pointnet2_sa_msg(): assert sa_indices[2].shape == torch.Size([1, 256]) assert sa_indices[3].shape == torch.Size([1, 64]) assert sa_indices[4].shape == torch.Size([1, 16]) + + +def test_dgcnn_gf(): + if not torch.cuda.is_available(): + pytest.skip() + + # DGCNNGF used in segmentation + cfg = dict( + type='DGCNNBackbone', + in_channels=6, + num_samples=(20, 20, 20), + knn_modes=['D-KNN', 'F-KNN', 'F-KNN'], + radius=(None, None, None), + gf_channels=((64, 64), (64, 64), (64, )), + fa_channels=(1024, ), + act_cfg=dict(type='ReLU')) + + self = build_backbone(cfg) + self.cuda() + + xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', dtype=np.float32) + xyz = torch.from_numpy(xyz).view(1, -1, 6).cuda() # (B, N, 6) + # test forward + ret_dict = self(xyz) + gf_points = ret_dict['gf_points'] + fa_points = ret_dict['fa_points'] + + assert len(gf_points) == 4 + assert gf_points[0].shape == torch.Size([1, 100, 6]) + assert gf_points[1].shape == torch.Size([1, 100, 64]) + assert gf_points[2].shape == torch.Size([1, 100, 64]) + assert gf_points[3].shape == torch.Size([1, 100, 64]) + assert fa_points.shape == torch.Size([1, 100, 1216]) + + +def test_dla_net(): + # test DLANet used in SMOKE + # test list config + cfg = dict( + type='DLANet', + depth=34, + in_channels=3, + norm_cfg=dict(type='GN', num_groups=32)) + + img = torch.randn((4, 3, 32, 32)) + self = build_backbone(cfg) + self.init_weights() + + results = self(img) + assert len(results) == 6 + assert results[0].shape == torch.Size([4, 16, 32, 32]) + assert results[1].shape == torch.Size([4, 32, 16, 16]) + assert results[2].shape == torch.Size([4, 64, 8, 8]) + assert results[3].shape == torch.Size([4, 128, 4, 4]) + assert results[4].shape == torch.Size([4, 256, 2, 2]) + assert results[5].shape == torch.Size([4, 512, 1, 1]) diff --git a/tests/test_models/test_common_modules/test_dgcnn_modules.py b/tests/test_models/test_common_modules/test_dgcnn_modules.py new file mode 100644 index 0000000000..031971b459 --- /dev/null +++ b/tests/test_models/test_common_modules/test_dgcnn_modules.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
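+# Unit tests for the DGCNN GF, FA and FP modules; each test is skipped when
+# CUDA is not available.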
+import numpy as np +import pytest +import torch + + +def test_dgcnn_gf_module(): + if not torch.cuda.is_available(): + pytest.skip() + from mmdet3d.ops import DGCNNGFModule + + self = DGCNNGFModule( + mlp_channels=[18, 64, 64], + num_sample=20, + knn_mod='D-KNN', + radius=None, + norm_cfg=dict(type='BN2d'), + act_cfg=dict(type='ReLU'), + pool_mod='max').cuda() + + assert self.mlps[0].layer0.conv.in_channels == 18 + assert self.mlps[0].layer0.conv.out_channels == 64 + + xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32) + + # (B, N, C) + xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda() + points = xyz.repeat([1, 1, 3]) + + # test forward + new_points = self(points) + + assert new_points.shape == torch.Size([1, 200, 64]) + + # test F-KNN mod + self = DGCNNGFModule( + mlp_channels=[6, 64, 64], + num_sample=20, + knn_mod='F-KNN', + radius=None, + norm_cfg=dict(type='BN2d'), + act_cfg=dict(type='ReLU'), + pool_mod='max').cuda() + + # test forward + new_points = self(xyz) + assert new_points.shape == torch.Size([1, 200, 64]) + + # test ball query + self = DGCNNGFModule( + mlp_channels=[6, 64, 64], + num_sample=20, + knn_mod='F-KNN', + radius=0.2, + norm_cfg=dict(type='BN2d'), + act_cfg=dict(type='ReLU'), + pool_mod='max').cuda() + + +def test_dgcnn_fa_module(): + if not torch.cuda.is_available(): + pytest.skip() + from mmdet3d.ops import DGCNNFAModule + + self = DGCNNFAModule(mlp_channels=[24, 16]).cuda() + assert self.mlps.layer0.conv.in_channels == 24 + assert self.mlps.layer0.conv.out_channels == 16 + + points = [torch.rand(1, 200, 12).float().cuda() for _ in range(3)] + + fa_points = self(points) + assert fa_points.shape == torch.Size([1, 200, 40]) + + +def test_dgcnn_fp_module(): + if not torch.cuda.is_available(): + pytest.skip() + from mmdet3d.ops import DGCNNFPModule + + self = DGCNNFPModule(mlp_channels=[24, 16]).cuda() + assert self.mlps.layer0.conv.in_channels == 24 + assert self.mlps.layer0.conv.out_channels == 16 + + xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', + np.float32).reshape((-1, 6)) + + # (B, N, 3) + xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda() + points = xyz.repeat([1, 1, 8]).cuda() + + fp_points = self(points) + assert fp_points.shape == torch.Size([1, 200, 16]) diff --git a/tests/test_models/test_common_modules/test_pointnet_modules.py b/tests/test_models/test_common_modules/test_pointnet_modules.py index a3a96cbea7..66c21b19d6 100644 --- a/tests/test_models/test_common_modules/test_pointnet_modules.py +++ b/tests/test_models/test_common_modules/test_pointnet_modules.py @@ -108,6 +108,20 @@ def test_pointnet_sa_module_msg(): assert new_features.shape == torch.Size([1, 48, 20]) assert inds.shape == torch.Size([1, 20]) + # test num_points = None + self = PointSAModuleMSG( + num_point=None, + radii=[0.2, 0.4], + sample_nums=[4, 8], + mlp_channels=[[12, 16], [12, 32]], + norm_cfg=dict(type='BN2d'), + use_xyz=False, + pool_mod='max').cuda() + + # test forward + new_xyz, new_features, inds = self(xyz, features) + assert new_features.shape == torch.Size([1, 48, 1]) + # length of 'fps_mod' should be same as 'fps_sample_range_list' with pytest.raises(AssertionError): PointSAModuleMSG( diff --git a/tests/test_models/test_common_modules/test_pointnet_ops.py b/tests/test_models/test_common_modules/test_pointnet_ops.py index ec3a581e63..30ad69b5d6 100644 --- a/tests/test_models/test_common_modules/test_pointnet_ops.py +++ b/tests/test_models/test_common_modules/test_pointnet_ops.py @@ -2,16 +2,9 @@ import pytest import torch -from mmdet3d.ops 
import ( - ball_query, - furthest_point_sample, - furthest_point_sample_with_dist, - gather_points, - grouping_operation, - knn, - three_interpolate, - three_nn, -) +from mmdet3d.ops import (ball_query, furthest_point_sample, + furthest_point_sample_with_dist, gather_points, + grouping_operation, knn, three_interpolate, three_nn) def test_fps(): diff --git a/tests/test_models/test_common_modules/test_roiaware_pool3d.py b/tests/test_models/test_common_modules/test_roiaware_pool3d.py index c36bc3b26a..90559dc976 100644 --- a/tests/test_models/test_common_modules/test_roiaware_pool3d.py +++ b/tests/test_models/test_common_modules/test_roiaware_pool3d.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import numpy as np import pytest import torch -from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch, +from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_all, points_in_boxes_cpu, - points_in_boxes_gpu) + points_in_boxes_part) def test_RoIAwarePool3d(): @@ -16,8 +17,8 @@ def test_RoIAwarePool3d(): roiaware_pool3d_avg = RoIAwarePool3d( out_size=4, max_pts_per_voxel=128, mode='avg') rois = torch.tensor( - [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], - [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]], + [[1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2], + [-10.0, 23.0, 16.0, 20.0, 10.0, 20.0, -0.5 - np.pi / 2]], dtype=torch.float32).cuda( ) # boxes (m, 7) with bottom center in lidar coordinate pts = torch.tensor( @@ -41,7 +42,7 @@ def test_RoIAwarePool3d(): torch.tensor(49.750).cuda(), 1e-3) -def test_points_in_boxes_gpu(): +def test_points_in_boxes_part(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') boxes = torch.tensor( @@ -57,45 +58,68 @@ def test_points_in_boxes_gpu(): [0, 0, 0], [6, 7, 8], [-2, -3, -4], [6, 4, 9]]], dtype=torch.float32).cuda() # points (b, m, 3) in lidar coordinate - point_indices = points_in_boxes_gpu(points=pts, boxes=boxes) + point_indices = points_in_boxes_part(points=pts, boxes=boxes) expected_point_indices = torch.tensor( [[0, 0, 0, 0, 0, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1]], dtype=torch.int32).cuda() assert point_indices.shape == torch.Size([2, 8]) assert (point_indices == expected_point_indices).all() + boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]], + dtype=torch.float32).cuda() # 30 degrees + pts = torch.tensor( + [[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0], + [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]], + dtype=torch.float32).cuda() + point_indices = points_in_boxes_part(points=pts, boxes=boxes) + expected_point_indices = torch.tensor([[-1, -1, 0, -1, 0, -1, -1, -1]], + dtype=torch.int32).cuda() + assert (point_indices == expected_point_indices).all() + if torch.cuda.device_count() > 1: pts = pts.to('cuda:1') boxes = boxes.to('cuda:1') expected_point_indices = expected_point_indices.to('cuda:1') - point_indices = points_in_boxes_gpu(points=pts, boxes=boxes) + point_indices = points_in_boxes_part(points=pts, boxes=boxes) assert point_indices.shape == torch.Size([2, 8]) assert (point_indices == expected_point_indices).all() def test_points_in_boxes_cpu(): boxes = torch.tensor( - [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], - [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]], + [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], + [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]], dtype=torch.float32 ) # boxes (m, 7) with bottom center in lidar coordinate pts = torch.tensor( - [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6], - [0.8, 1.2, 
3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3], - [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], - [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]], + [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6], + [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3], + [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [ + -16, -18, 9 + ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]], dtype=torch.float32) # points (n, 3) in lidar coordinate point_indices = points_in_boxes_cpu(points=pts, boxes=boxes) expected_point_indices = torch.tensor( - [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0], + [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]], dtype=torch.int32) - assert point_indices.shape == torch.Size([2, 15]) + assert point_indices.shape == torch.Size([1, 15, 2]) + assert (point_indices == expected_point_indices).all() + + boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]], + dtype=torch.float32) # 30 degrees + pts = torch.tensor( + [[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0], + [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]], + dtype=torch.float32) + point_indices = points_in_boxes_cpu(points=pts, boxes=boxes) + expected_point_indices = torch.tensor( + [[[0], [0], [1], [0], [1], [0], [0], [0]]], dtype=torch.int32) assert (point_indices == expected_point_indices).all() -def test_points_in_boxes_batch(): +def test_points_in_boxes_all(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') @@ -112,7 +136,7 @@ def test_points_in_boxes_batch(): ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]], dtype=torch.float32).cuda() # points (n, 3) in lidar coordinate - point_indices = points_in_boxes_batch(points=pts, boxes=boxes) + point_indices = points_in_boxes_all(points=pts, boxes=boxes) expected_point_indices = torch.tensor( [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]], @@ -124,6 +148,6 @@ def test_points_in_boxes_batch(): pts = pts.to('cuda:1') boxes = boxes.to('cuda:1') expected_point_indices = expected_point_indices.to('cuda:1') - point_indices = points_in_boxes_batch(points=pts, boxes=boxes) + point_indices = points_in_boxes_all(points=pts, boxes=boxes) assert point_indices.shape == torch.Size([1, 15, 2]) assert (point_indices == expected_point_indices).all() diff --git a/tests/test_models/test_detectors.py b/tests/test_models/test_detectors.py index dc7a70812b..815627fc15 100644 --- a/tests/test_models/test_detectors.py +++ b/tests/test_models/test_detectors.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy +import random +from os.path import dirname, exists, join + import numpy as np import pytest -import random import torch -from os.path import dirname, exists, join from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, LiDARInstance3DBoxes) @@ -437,7 +438,8 @@ def test_imvoxelnet(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') - imvoxelnet_cfg = _get_detector_cfg('imvoxelnet/imvoxelnet_kitti-3d-car.py') + imvoxelnet_cfg = _get_detector_cfg( + 'imvoxelnet/imvoxelnet_4x8_kitti-3d-car.py') self = build_detector(imvoxelnet_cfg).cuda() imgs = torch.rand([1, 3, 384, 1280], dtype=torch.float32).cuda() gt_bboxes_3d = [LiDARInstance3DBoxes(torch.rand([3, 7], device='cuda'))] @@ -469,3 +471,100 @@ def test_imvoxelnet(): assert boxes_3d.tensor.shape[1] == 7 assert scores_3d.shape[0] >= 0 assert labels_3d.shape[0] >= 0 + + +def test_pointrcnn(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + pointrcnn_cfg = _get_detector_cfg( + 'pointrcnn/pointrcnn_2x8_kitti-3d-3classes.py') + self = build_detector(pointrcnn_cfg).cuda() + points_0 = torch.rand([1000, 4], device='cuda') + points_1 = torch.rand([1000, 4], device='cuda') + points = [points_0, points_1] + + img_meta_0 = dict(box_type_3d=LiDARInstance3DBoxes) + img_meta_1 = dict(box_type_3d=LiDARInstance3DBoxes) + img_metas = [img_meta_0, img_meta_1] + gt_bbox_0 = LiDARInstance3DBoxes(torch.rand([10, 7], device='cuda')) + gt_bbox_1 = LiDARInstance3DBoxes(torch.rand([10, 7], device='cuda')) + gt_bboxes = [gt_bbox_0, gt_bbox_1] + gt_labels_0 = torch.randint(0, 3, [10], device='cuda') + gt_labels_1 = torch.randint(0, 3, [10], device='cuda') + gt_labels = [gt_labels_0, gt_labels_1] + + # test_forward_train + losses = self.forward_train(points, img_metas, gt_bboxes, gt_labels) + assert losses['bbox_loss'] >= 0 + assert losses['semantic_loss'] >= 0 + assert losses['loss_cls'] >= 0 + assert losses['loss_bbox'] >= 0 + assert losses['loss_corner'] >= 0 + + +def test_smoke(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + _setup_seed(0) + smoke_cfg = _get_detector_cfg( + 'smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py') + self = build_detector(smoke_cfg).cuda() + imgs = torch.rand([1, 3, 384, 1280], dtype=torch.float32).cuda() + gt_bboxes = [ + torch.Tensor([[563.63122442, 175.02195182, 614.81298184, 224.97763099], + [480.89676358, 179.86272635, 511.53017463, 202.54645962], + [541.48322272, 175.73767011, 564.55208966, 193.95009791], + [329.51448848, 176.14566789, 354.24670848, + 213.82599081]]).cuda() + ] + gt_bboxes_3d = [ + CameraInstance3DBoxes( + torch.Tensor([[-0.69, 1.69, 25.01, 3.20, 1.61, 1.66, -1.59], + [-7.43, 1.88, 47.55, 3.70, 1.40, 1.51, 1.55], + [-4.71, 1.71, 60.52, 4.05, 1.46, 1.66, 1.56], + [-12.63, 1.88, 34.09, 1.95, 1.72, 0.50, + 1.54]]).cuda(), + box_dim=7) + ] + gt_labels = [torch.tensor([0, 0, 0, 1]).cuda()] + gt_labels_3d = gt_labels + centers2d = [ + torch.Tensor([[589.6528477, 198.3862263], [496.8143155, 190.75967182], + [553.40528354, 184.53785991], + [342.23690317, 194.44298819]]).cuda() + ] + # depths is actually not used in smoke head loss computation + depths = [torch.rand([3], dtype=torch.float32).cuda()] + attr_labels = None + img_metas = [ + dict( + cam_intrinsic=[[721.5377, 0., 609.5593, 0.], + [0., 721.5377, 172.854, 0.], [0., 0., 1., 0.], + [0., 0., 0., 1.]], + scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32), + pad_shape=[384, 1280], + 
trans_mat=np.array([[0.25, 0., 0.], [0., 0.25, 0], [0., 0., 1.]], + dtype=np.float32), + affine_aug=False, + box_type_3d=CameraInstance3DBoxes) + ] + + # test forward_train + losses = self.forward_train(imgs, img_metas, gt_bboxes, gt_labels, + gt_bboxes_3d, gt_labels_3d, centers2d, depths, + attr_labels) + + assert losses['loss_cls'] >= 0 + assert losses['loss_bbox'] >= 0 + + # test simple_test + with torch.no_grad(): + results = self.simple_test(imgs, img_metas) + boxes_3d = results[0]['img_bbox']['boxes_3d'] + scores_3d = results[0]['img_bbox']['scores_3d'] + labels_3d = results[0]['img_bbox']['labels_3d'] + assert boxes_3d.tensor.shape[0] >= 0 + assert boxes_3d.tensor.shape[1] == 7 + assert scores_3d.shape[0] >= 0 + assert labels_3d.shape[0] >= 0 diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index eabacc0928..d6f4f1090e 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -6,9 +6,10 @@ xdoctest tests/test_models/test_forward.py zero """ import copy +from os.path import dirname, exists, join + import numpy as np import torch -from os.path import dirname, exists, join def _get_config_directory(): @@ -148,7 +149,7 @@ def _demo_mm_inputs(input_shape=(1, 3, 300, 300), input_shape (tuple): input batch dimensions - num_items (None | List[int]): + num_items (List[int]): specifies the number of boxes in each batch item num_classes (int): diff --git a/tests/test_models/test_heads/test_dgcnn_decode_head.py b/tests/test_models/test_heads/test_dgcnn_decode_head.py new file mode 100644 index 0000000000..6d1f149530 --- /dev/null +++ b/tests/test_models/test_heads/test_dgcnn_decode_head.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pytest +import torch +from mmcv.cnn.bricks import ConvModule + +from mmdet3d.models.builder import build_head + + +def test_dgcnn_decode_head_loss(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + dgcnn_decode_head_cfg = dict( + type='DGCNNHead', + fp_channels=(1024, 512), + channels=256, + num_classes=13, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='LeakyReLU', negative_slope=0.2), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, + loss_weight=1.0), + ignore_index=13) + + self = build_head(dgcnn_decode_head_cfg) + self.cuda() + assert isinstance(self.conv_seg, torch.nn.Conv1d) + assert self.conv_seg.in_channels == 256 + assert self.conv_seg.out_channels == 13 + assert self.conv_seg.kernel_size == (1, ) + assert isinstance(self.pre_seg_conv, ConvModule) + assert isinstance(self.pre_seg_conv.conv, torch.nn.Conv1d) + assert self.pre_seg_conv.conv.in_channels == 512 + assert self.pre_seg_conv.conv.out_channels == 256 + assert self.pre_seg_conv.conv.kernel_size == (1, ) + assert isinstance(self.pre_seg_conv.bn, torch.nn.BatchNorm1d) + assert self.pre_seg_conv.bn.num_features == 256 + + # test forward + fa_points = torch.rand(2, 4096, 1024).float().cuda() + input_dict = dict(fa_points=fa_points) + seg_logits = self(input_dict) + assert seg_logits.shape == torch.Size([2, 13, 4096]) + + # test loss + pts_semantic_mask = torch.randint(0, 13, (2, 4096)).long().cuda() + losses = self.losses(seg_logits, pts_semantic_mask) + assert losses['loss_sem_seg'].item() > 0 + + # test loss with ignore_index + ignore_index_mask = torch.ones_like(pts_semantic_mask) * 13 + losses = self.losses(seg_logits, ignore_index_mask) + assert 
losses['loss_sem_seg'].item() == 0 + + # test loss with class_weight + dgcnn_decode_head_cfg['loss_decode'] = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=np.random.rand(13), + loss_weight=1.0) + self = build_head(dgcnn_decode_head_cfg) + self.cuda() + losses = self.losses(seg_logits, pts_semantic_mask) + assert losses['loss_sem_seg'].item() > 0 diff --git a/tests/test_models/test_heads/test_heads.py b/tests/test_models/test_heads/test_heads.py index 01f1c09509..0f51f5e831 100644 --- a/tests/test_models/test_heads/test_heads.py +++ b/tests/test_models/test_heads/test_heads.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import random +from os.path import dirname, exists, join + +import mmcv import numpy as np import pytest -import random import torch -from os.path import dirname, exists, join from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, DepthInstance3DBoxes, LiDARInstance3DBoxes) @@ -116,6 +118,23 @@ def _get_pts_bbox_head_cfg(fname): return pts_bbox_head +def _get_pointrcnn_rpn_head_cfg(fname): + """Grab configs necessary to create a rpn_head. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + train_cfg = mmcv.Config(copy.deepcopy(config.model.train_cfg)) + test_cfg = mmcv.Config(copy.deepcopy(config.model.test_cfg)) + + rpn_head = model.rpn_head + rpn_head.update(train_cfg=train_cfg.rpn) + rpn_head.update(test_cfg=test_cfg.rpn) + return rpn_head, train_cfg.rpn.rpn_proposal + + def _get_vote_head_cfg(fname): """Grab configs necessary to create a vote_head. @@ -147,6 +166,14 @@ def _get_parta2_bbox_head_cfg(fname): return vote_head +def _get_pointrcnn_bbox_head_cfg(fname): + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + + vote_head = model.roi_head.bbox_head + return vote_head + + def test_anchor3d_head_loss(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') @@ -263,6 +290,39 @@ def test_parta2_rpnhead_getboxes(): assert result_list[0]['boxes_3d'].tensor.shape == torch.Size([512, 7]) +def test_pointrcnn_rpnhead_getboxes(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + rpn_head_cfg, proposal_cfg = _get_pointrcnn_rpn_head_cfg( + './pointrcnn/pointrcnn_2x8_kitti-3d-3classes.py') + self = build_head(rpn_head_cfg) + self.cuda() + + fp_features = torch.rand([2, 128, 1024], dtype=torch.float32).cuda() + feats = {'fp_features': fp_features} + # fake input_metas + input_metas = [{ + 'sample_idx': 1234, + 'box_type_3d': LiDARInstance3DBoxes, + 'box_mode_3d': Box3DMode.LIDAR + }, { + 'sample_idx': 2345, + 'box_type_3d': LiDARInstance3DBoxes, + 'box_mode_3d': Box3DMode.LIDAR + }] + (bbox_preds, cls_preds) = self.forward(feats) + assert bbox_preds.shape == (2, 1024, 8) + assert cls_preds.shape == (2, 1024, 3) + points = torch.rand([2, 1024, 3], dtype=torch.float32).cuda() + result_list = self.get_bboxes(points, bbox_preds, cls_preds, input_metas) + max_num = proposal_cfg.max_num + bbox, score_selected, labels, cls_preds_selected = result_list[0] + assert bbox.tensor.shape == (max_num, 7) + assert score_selected.shape == (max_num, ) + assert labels.shape == (max_num, ) + assert cls_preds_selected.shape == (max_num, 3) + + def test_vote_head(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') @@ -358,6 +418,102 @@ def test_vote_head(): assert 
results[0][2].shape[0] >= 0 +def test_smoke_mono3d_head(): + + head_cfg = dict( + type='SMOKEMono3DHead', + num_classes=3, + in_channels=64, + dim_channel=[3, 4, 5], + ori_channel=[6, 7], + stacked_convs=0, + feat_channels=64, + use_direction_classifier=False, + diff_rad_by_sin=False, + pred_attrs=False, + pred_velo=False, + dir_offset=0, + strides=None, + group_reg_dims=(8, ), + cls_branch=(256, ), + reg_branch=((256, ), ), + num_attrs=0, + bbox_code_size=7, + dir_branch=(), + attr_branch=(), + bbox_coder=dict( + type='SMOKECoder', + base_depth=(28.01, 16.32), + base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63, + 1.53)), + code_size=7), + loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=None, + conv_bias=True, + dcn_on_last_conv=False) + + self = build_head(head_cfg) + + feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)] + + # test forward + ret_dict = self(feats) + + assert len(ret_dict) == 2 + assert len(ret_dict[0]) == 1 + assert ret_dict[0][0].shape == torch.Size([2, 3, 32, 32]) + assert ret_dict[1][0].shape == torch.Size([2, 8, 32, 32]) + + # test loss + gt_bboxes = [ + torch.Tensor([[1.0, 2.0, 20.0, 40.0], [45.0, 50.0, 80.0, 70.1], + [34.0, 39.0, 65.0, 64.0]]), + torch.Tensor([[11.0, 22.0, 29.0, 31.0], [41.0, 55.0, 60.0, 99.0], + [29.0, 29.0, 65.0, 56.0]]) + ] + gt_bboxes_3d = [ + CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7), + CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7) + ] + gt_labels = [torch.randint(0, 3, [3]) for i in range(2)] + gt_labels_3d = gt_labels + centers2d = [torch.randint(0, 60, (3, 2)), torch.randint(0, 40, (3, 2))] + depths = [ + torch.rand([3], dtype=torch.float32), + torch.rand([3], dtype=torch.float32) + ] + attr_labels = None + img_metas = [ + dict( + cam2img=[[1260.8474446004698, 0.0, 807.968244525554, 40.1111], + [0.0, 1260.8474446004698, 495.3344268742088, 2.34422], + [0.0, 0.0, 1.0, 0.00333333], [0.0, 0.0, 0.0, 1.0]], + scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32), + pad_shape=[128, 128], + trans_mat=np.array([[0.25, 0., 0.], [0., 0.25, 0], [0., 0., 1.]], + dtype=np.float32), + affine_aug=False, + box_type_3d=CameraInstance3DBoxes) for i in range(2) + ] + losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, img_metas) + + assert losses['loss_cls'] >= 0 + assert losses['loss_bbox'] >= 0 + + # test get_boxes + results = self.get_bboxes(*ret_dict, img_metas) + assert len(results) == 2 + assert len(results[0]) == 4 + assert results[0][0].tensor.shape == torch.Size([100, 7]) + assert results[0][1].shape == torch.Size([100]) + assert results[0][2].shape == torch.Size([100]) + assert results[0][3] is None + + def test_parta2_bbox_head(): parta2_bbox_head_cfg = _get_parta2_bbox_head_cfg( './parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py') @@ -370,6 +526,18 @@ def test_parta2_bbox_head(): assert bbox_pred.shape == (256, 7) +def test_pointrcnn_bbox_head(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + pointrcnn_bbox_head_cfg = _get_pointrcnn_bbox_head_cfg( + './pointrcnn/pointrcnn_2x8_kitti-3d-3classes.py') + self = build_head(pointrcnn_bbox_head_cfg).cuda() + feats = torch.rand([100, 512, 133]).cuda() + rcnn_cls, rcnn_reg = self.forward(feats) + assert rcnn_cls.shape == (100, 1) + assert rcnn_reg.shape == (100, 7) + + def 
test_part_aggregation_ROI_head(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') @@ -444,6 +612,50 @@ def test_part_aggregation_ROI_head(): assert labels_3d.shape == (12, ) +def test_pointrcnn_roi_head(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + roi_head_cfg = _get_roi_head_cfg( + './pointrcnn/pointrcnn_2x8_kitti-3d-3classes.py') + + self = build_head(roi_head_cfg).cuda() + + features = torch.rand([3, 128, 16384]).cuda() + points = torch.rand([3, 16384, 3]).cuda() + points_cls_preds = torch.rand([3, 16384, 3]).cuda() + rcnn_feats = { + 'features': features, + 'points': points, + 'points_cls_preds': points_cls_preds + } + boxes_3d = LiDARInstance3DBoxes(torch.rand(50, 7).cuda()) + labels_3d = torch.randint(low=0, high=2, size=[50]).cuda() + proposal = {'boxes_3d': boxes_3d, 'labels_3d': labels_3d} + proposal_list = [proposal for i in range(3)] + gt_bboxes_3d = [ + LiDARInstance3DBoxes(torch.rand([5, 7], device='cuda')) + for i in range(3) + ] + gt_labels_3d = [torch.randint(0, 2, [5], device='cuda') for i in range(3)] + box_type_3d = LiDARInstance3DBoxes + img_metas = [dict(box_type_3d=box_type_3d) for i in range(3)] + + losses = self.forward_train(rcnn_feats, img_metas, proposal_list, + gt_bboxes_3d, gt_labels_3d) + assert losses['loss_cls'] >= 0 + assert losses['loss_bbox'] >= 0 + assert losses['loss_corner'] >= 0 + + bbox_results = self.simple_test(rcnn_feats, img_metas, proposal_list) + boxes_3d = bbox_results[0]['boxes_3d'] + scores_3d = bbox_results[0]['scores_3d'] + labels_3d = bbox_results[0]['labels_3d'] + assert boxes_3d.tensor.shape[1] == 7 + assert boxes_3d.tensor.shape[0] == scores_3d.shape[0] + assert scores_3d.shape[0] == labels_3d.shape[0] + + def test_free_anchor_3D_head(): if not torch.cuda.is_available(): pytest.skip('test requires GPU and torch+cuda') @@ -604,7 +816,7 @@ def test_h3d_head(): h3d_head_cfg.bbox_head.num_proposal = num_proposal self = build_head(h3d_head_cfg).cuda() - # prepare roi outputs + # prepare RoI outputs fp_xyz = [torch.rand([1, num_point, 3], dtype=torch.float32).cuda()] hd_features = torch.rand([1, 256, num_point], dtype=torch.float32).cuda() fp_indices = [torch.randint(0, 128, [1, num_point]).cuda()] @@ -1144,7 +1356,7 @@ def test_groupfree3d_head(): assert ret_dict['s5.sem_scores'].shape == torch.Size([2, 256, 18]) # test losses - points = [torch.rand([50000, 4], device='cuda') for i in range(2)] + points = [torch.rand([5000, 4], device='cuda') for i in range(2)] gt_bbox1 = torch.rand([10, 7], dtype=torch.float32).cuda() gt_bbox2 = torch.rand([10, 7], dtype=torch.float32).cuda() @@ -1152,12 +1364,12 @@ def test_groupfree3d_head(): gt_bbox2 = DepthInstance3DBoxes(gt_bbox2) gt_bboxes = [gt_bbox1, gt_bbox2] - pts_instance_mask_1 = torch.randint(0, 10, [50000], device='cuda') - pts_instance_mask_2 = torch.randint(0, 10, [50000], device='cuda') + pts_instance_mask_1 = torch.randint(0, 10, [5000], device='cuda') + pts_instance_mask_2 = torch.randint(0, 10, [5000], device='cuda') pts_instance_mask = [pts_instance_mask_1, pts_instance_mask_2] - pts_semantic_mask_1 = torch.randint(0, 19, [50000], device='cuda') - pts_semantic_mask_2 = torch.randint(0, 19, [50000], device='cuda') + pts_semantic_mask_1 = torch.randint(0, 19, [5000], device='cuda') + pts_semantic_mask_2 = torch.randint(0, 19, [5000], device='cuda') pts_semantic_mask = [pts_semantic_mask_1, pts_semantic_mask_2] labels_1 = torch.randint(0, 18, [10], device='cuda') @@ -1178,7 +1390,7 @@ def 
test_groupfree3d_head(): # test multiclass_nms_single obj_scores = torch.rand([256], device='cuda') sem_scores = torch.rand([256, 18], device='cuda') - points = torch.rand([50000, 3], device='cuda') + points = torch.rand([5000, 3], device='cuda') bbox = torch.rand([256, 7], device='cuda') input_meta = dict(box_type_3d=DepthInstance3DBoxes) bbox_selected, score_selected, labels = \ @@ -1193,9 +1405,9 @@ def test_groupfree3d_head(): assert labels.shape[0] >= 0 # test get_boxes - points = torch.rand([1, 50000, 3], device='cuda') + points = torch.rand([1, 5000, 3], device='cuda') seed_points = torch.rand([1, 1024, 3], device='cuda') - seed_indices = torch.randint(0, 50000, [1, 1024], device='cuda') + seed_indices = torch.randint(0, 5000, [1, 1024], device='cuda') obj_scores = torch.rand([1, 256, 1], device='cuda') center = torch.rand([1, 256, 3], device='cuda') dir_class = torch.rand([1, 256, 1], device='cuda') @@ -1222,3 +1434,134 @@ def test_groupfree3d_head(): assert results[0][0].tensor.shape[1] == 7 assert results[0][1].shape[0] >= 0 assert results[0][2].shape[0] >= 0 + + +def test_pgd_head(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + _setup_seed(0) + pgd_head_cfg = _get_head_cfg( + 'pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d.py') + self = build_head(pgd_head_cfg).cuda() + + feats = [ + torch.rand([2, 256, 96, 312], dtype=torch.float32).cuda(), + torch.rand([2, 256, 48, 156], dtype=torch.float32).cuda(), + torch.rand([2, 256, 24, 78], dtype=torch.float32).cuda(), + torch.rand([2, 256, 12, 39], dtype=torch.float32).cuda(), + ] + + # test forward + ret_dict = self(feats) + assert len(ret_dict) == 7 + assert len(ret_dict[0]) == 4 + assert ret_dict[0][0].shape == torch.Size([2, 3, 96, 312]) + + # test loss + gt_bboxes = [ + torch.rand([3, 4], dtype=torch.float32).cuda(), + torch.rand([3, 4], dtype=torch.float32).cuda() + ] + gt_bboxes_3d = CameraInstance3DBoxes( + torch.rand([3, 7], device='cuda'), box_dim=7) + gt_labels = [torch.randint(0, 3, [3], device='cuda') for i in range(2)] + gt_labels_3d = gt_labels + centers2d = [ + torch.rand([3, 2], dtype=torch.float32).cuda(), + torch.rand([3, 2], dtype=torch.float32).cuda() + ] + depths = [ + torch.rand([3], dtype=torch.float32).cuda(), + torch.rand([3], dtype=torch.float32).cuda() + ] + attr_labels = None + img_metas = [ + dict( + img_shape=[384, 1248], + cam2img=[[721.5377, 0.0, 609.5593, 44.85728], + [0.0, 721.5377, 172.854, 0.2163791], + [0.0, 0.0, 1.0, 0.002745884], [0.0, 0.0, 0.0, 1.0]], + scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32), + box_type_3d=CameraInstance3DBoxes) for i in range(2) + ] + losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, img_metas) + assert losses['loss_cls'] >= 0 + assert losses['loss_offset'] >= 0 + assert losses['loss_depth'] >= 0 + assert losses['loss_size'] >= 0 + assert losses['loss_rotsin'] >= 0 + assert losses['loss_centerness'] >= 0 + assert losses['loss_kpts'] >= 0 + assert losses['loss_bbox2d'] >= 0 + assert losses['loss_consistency'] >= 0 + assert losses['loss_dir'] >= 0 + + # test get_boxes + results = self.get_bboxes(*ret_dict, img_metas) + assert len(results) == 2 + assert len(results[0]) == 5 + assert results[0][0].tensor.shape == torch.Size([20, 7]) + assert results[0][1].shape == torch.Size([20]) + assert results[0][2].shape == torch.Size([20]) + assert results[0][3] is None + assert results[0][4].shape == torch.Size([20, 5]) + + +def test_monoflex_head(): + + head_cfg = 
dict( + type='MonoFlexHead', + num_classes=3, + in_channels=64, + use_edge_fusion=True, + edge_fusion_inds=[(1, 0)], + edge_heatmap_ratio=1 / 8, + stacked_convs=0, + feat_channels=64, + use_direction_classifier=False, + diff_rad_by_sin=False, + pred_attrs=False, + pred_velo=False, + dir_offset=0, + strides=None, + group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), + (1, )), + cls_branch=(256, ), + reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ), + (256, ), (256, )), + num_attrs=0, + bbox_code_size=7, + dir_branch=(), + attr_branch=(), + bbox_coder=dict( + type='MonoFlexCoder', + depth_mode='exp', + base_depth=(26.494627, 16.05988), + depth_range=[0.1, 100], + combine_depth=True, + uncertainty_range=[-10, 10], + base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022), + (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427), + (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)), + dims_mode='linear', + multibin=True, + num_dir_bins=4, + bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2], + bin_margin=np.pi / 6, + code_size=7), + conv_bias=True, + dcn_on_last_conv=False) + + self = build_head(head_cfg) + + feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)] + + input_metas = [ + dict(img_shape=(110, 110), pad_shape=(128, 128)), + dict(img_shape=(98, 110), pad_shape=(128, 128)) + ] + cls_score, out_reg = self(feats, input_metas) + + assert cls_score[0].shape == torch.Size([2, 3, 32, 32]) + assert out_reg[0].shape == torch.Size([2, 50, 32, 32]) diff --git a/tests/test_models/test_heads/test_parta2_bbox_head.py b/tests/test_models/test_heads/test_parta2_bbox_head.py index c30d638b71..140e72e087 100644 --- a/tests/test_models/test_heads/test_parta2_bbox_head.py +++ b/tests/test_models/test_heads/test_parta2_bbox_head.py @@ -76,7 +76,7 @@ def test_loss(): 2.0579e-02, 1.5005e-04, 3.5252e-05, 0.0000e+00, 2.0433e-05, 1.5422e-05 ]) expected_loss_bbox = torch.as_tensor(0.0622) - expected_loss_corner = torch.Tensor([0.1379]) + expected_loss_corner = torch.Tensor([0.1374]) assert torch.allclose(loss['loss_cls'], expected_loss_cls, 1e-3) assert torch.allclose(loss['loss_bbox'], expected_loss_bbox, 1e-3) @@ -201,7 +201,7 @@ def test_get_targets(): ]) expected_bbox_targets = torch.Tensor( - [[0.0805, 0.0130, 0.0047, 0.0542, -0.2252, 0.0299, -0.1495]]) + [[-0.0632, 0.0516, 0.0047, 0.0542, -0.2252, 0.0299, -0.1495]]) expected_pos_gt_bboxes = torch.Tensor( [[7.8417, -0.1405, -1.9652, 1.6122, 3.2838, 1.5331, -2.0835]]) @@ -345,12 +345,11 @@ def test_get_bboxes(): selected_bboxes, selected_scores, selected_label_preds = result_list[0] expected_selected_bboxes = torch.Tensor( - [[56.2170, 25.9074, -1.3610, 1.6025, 3.6730, 1.5128, -0.1179], - [54.6521, 28.8846, -1.9145, 1.6362, 4.0573, 1.5599, -1.7335], - [31.6179, -5.6004, -1.2470, 1.6458, 4.1622, 1.5632, -1.5734]]).cuda() + [[56.0888, 25.6445, -1.3610, 1.6025, 3.6730, 1.5128, -0.1179], + [54.4606, 29.2412, -1.9145, 1.6362, 4.0573, 1.5599, -1.7335], + [31.8887, -5.8574, -1.2470, 1.6458, 4.1622, 1.5632, -1.5734]]).cuda() expected_selected_scores = torch.Tensor([-2.2061, -2.1121, -0.1761]).cuda() expected_selected_label_preds = torch.Tensor([2., 2., 2.]).cuda() - assert torch.allclose(selected_bboxes.tensor, expected_selected_bboxes, 1e-3) assert torch.allclose(selected_scores, expected_selected_scores, 1e-3) @@ -387,43 +386,43 @@ def test_multi_class_nms(): box_preds = torch.Tensor( [[ 5.6217e+01, 2.5908e+01, -1.3611e+00, 1.6025e+00, 3.6730e+00, - 1.5129e+00, -1.1786e-01 + 1.5129e+00, 1.1786e-01 ], [ 5.4653e+01, 2.8885e+01, 
-1.9145e+00, 1.6362e+00, 4.0574e+00, - 1.5599e+00, -1.7335e+00 + 1.5599e+00, 1.7335e+00 ], [ 5.5809e+01, 2.5686e+01, -1.4457e+00, 1.5939e+00, 3.8270e+00, - 1.4997e+00, -2.9191e+00 + 1.4997e+00, 2.9191e+00 ], [ 5.6107e+01, 2.6082e+01, -1.3557e+00, 1.5782e+00, 3.7444e+00, - 1.5266e+00, 1.7707e-01 + 1.5266e+00, -1.7707e-01 ], [ 3.1618e+01, -5.6004e+00, -1.2470e+00, 1.6459e+00, 4.1622e+00, - 1.5632e+00, -1.5734e+00 + 1.5632e+00, 1.5734e+00 ], [ 3.1605e+01, -5.6342e+00, -1.2467e+00, 1.6474e+00, 4.1519e+00, - 1.5481e+00, -1.6313e+00 + 1.5481e+00, 1.6313e+00 ], [ 5.6211e+01, 2.7294e+01, -1.5350e+00, 1.5422e+00, 3.7733e+00, - 1.5140e+00, 9.5846e-02 + 1.5140e+00, -9.5846e-02 ], [ 5.5907e+01, 2.7155e+01, -1.4712e+00, 1.5416e+00, 3.7611e+00, - 1.5142e+00, -5.2059e-02 + 1.5142e+00, 5.2059e-02 ], [ 5.4000e+01, 3.0585e+01, -1.6874e+00, 1.6495e+00, 4.0376e+00, - 1.5554e+00, -1.7900e+00 + 1.5554e+00, 1.7900e+00 ], [ 5.6007e+01, 2.6300e+01, -1.3945e+00, 1.5716e+00, 3.7064e+00, - 1.4715e+00, -2.9639e+00 + 1.4715e+00, 2.9639e+00 ]]).cuda() input_meta = dict( diff --git a/tests/test_models/test_heads/test_roi_extractors.py b/tests/test_models/test_heads/test_roi_extractors.py index c6f21c3d8d..13296316ed 100644 --- a/tests/test_models/test_heads/test_roi_extractors.py +++ b/tests/test_models/test_heads/test_roi_extractors.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +import numpy as np import pytest import torch -from mmdet3d.models.roi_heads.roi_extractors import Single3DRoIAwareExtractor +from mmdet3d.models.roi_heads.roi_extractors import (Single3DRoIAwareExtractor, + Single3DRoIPointExtractor) def test_single_roiaware_extractor(): @@ -21,11 +23,35 @@ def test_single_roiaware_extractor(): dtype=torch.float32).cuda() coordinate = feats.clone() batch_inds = torch.zeros(feats.shape[0]).cuda() - rois = torch.tensor([[0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], - [0, -10.0, 23.0, 16.0, 10, 20, 20, 0.5]], + rois = torch.tensor([[0, 1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2], + [0, -10.0, 23.0, 16.0, 20, 10, 20, -0.5 - np.pi / 2]], dtype=torch.float32).cuda() # test forward pooled_feats = self(feats, coordinate, batch_inds, rois) assert pooled_feats.shape == torch.Size([2, 4, 4, 4, 3]) assert torch.allclose(pooled_feats.sum(), torch.tensor(51.100).cuda(), 1e-3) + + +def test_single_roipoint_extractor(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + roi_layer_cfg = dict( + type='RoIPointPool3d', num_sampled_points=512, pool_extra_width=0) + + self = Single3DRoIPointExtractor(roi_layer=roi_layer_cfg) + + feats = torch.tensor( + [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6], + [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3], + [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], + [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]], + dtype=torch.float32).unsqueeze(0).cuda() + points = feats.clone() + batch_inds = feats.shape[0] + rois = torch.tensor([[0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], + [0, -10.0, 23.0, 16.0, 10, 20, 20, 0.5]], + dtype=torch.float32).cuda() + pooled_feats = self(feats, points, batch_inds, rois) + assert pooled_feats.shape == torch.Size([2, 512, 6]) diff --git a/tests/test_models/test_heads/test_semantic_heads.py b/tests/test_models/test_heads/test_semantic_heads.py index 59f8a6b607..e259ecbff7 100644 --- a/tests/test_models/test_heads/test_semantic_heads.py +++ b/tests/test_models/test_heads/test_semantic_heads.py @@ -53,11 +53,11 @@ def test_PointwiseSemanticHead(): gt_bboxes = [ 
LiDARInstance3DBoxes( torch.tensor( - [[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, -0.9091]], + [[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, 0.9091]], dtype=torch.float32).cuda()), LiDARInstance3DBoxes( torch.tensor( - [[16.9107, 9.7925, -1.9201, 1.6097, 3.2786, 1.5307, -2.4056]], + [[16.9107, 9.7925, -1.9201, 1.6097, 3.2786, 1.5307, 2.4056]], dtype=torch.float32).cuda()) ] # batch size is 2 in the unit test diff --git a/tests/test_models/test_necks/test_necks.py b/tests/test_models/test_necks/test_necks.py index 1e48493cb8..c7bb28a0b6 100644 --- a/tests/test_models/test_necks/test_necks.py +++ b/tests/test_models/test_necks/test_necks.py @@ -57,3 +57,78 @@ def test_imvoxel_neck(): inputs = torch.rand([1, 64, 216, 248, 12], device='cuda') outputs = neck(inputs) assert outputs[0].shape == (1, 256, 248, 216) + + +def test_fp_neck(): + if not torch.cuda.is_available(): + pytest.skip() + + xyzs = [16384, 4096, 1024, 256, 64] + feat_channels = [1, 96, 256, 512, 1024] + channel_num = 5 + + sa_xyz = [torch.rand(3, xyzs[i], 3) for i in range(channel_num)] + sa_features = [ + torch.rand(3, feat_channels[i], xyzs[i]) for i in range(channel_num) + ] + + neck_cfg = dict( + type='PointNetFPNeck', + fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256), + (257, 128, 128))) + + neck = build_neck(neck_cfg) + neck.init_weights() + + if torch.cuda.is_available(): + sa_xyz = [x.cuda() for x in sa_xyz] + sa_features = [x.cuda() for x in sa_features] + neck.cuda() + + feats_sa = {'sa_xyz': sa_xyz, 'sa_features': sa_features} + outputs = neck(feats_sa) + assert outputs['fp_xyz'].cpu().numpy().shape == (3, 16384, 3) + assert outputs['fp_features'].detach().cpu().numpy().shape == (3, 128, + 16384) + + +def test_dla_neck(): + + s = 32 + in_channels = [16, 32, 64, 128, 256, 512] + feat_sizes = [s // 2**i for i in range(6)] # [32, 16, 8, 4, 2, 1] + + if torch.cuda.is_available(): + # Test DLA Neck with DCNv2 on GPU + neck_cfg = dict( + type='DLANeck', + in_channels=[16, 32, 64, 128, 256, 512], + start_level=2, + end_level=5, + norm_cfg=dict(type='GN', num_groups=32)) + neck = build_neck(neck_cfg) + neck.init_weights() + neck.cuda() + feats = [ + torch.rand(4, in_channels[i], feat_sizes[i], feat_sizes[i]).cuda() + for i in range(len(in_channels)) + ] + outputs = neck(feats) + assert outputs.shape == (4, 64, 8, 8) + else: + # Test DLA Neck without DCNv2 on CPU + neck_cfg = dict( + type='DLANeck', + in_channels=[16, 32, 64, 128, 256, 512], + start_level=2, + end_level=5, + norm_cfg=dict(type='GN', num_groups=32), + use_dcn=False) + neck = build_neck(neck_cfg) + neck.init_weights() + feats = [ + torch.rand(4, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + outputs = neck(feats) + assert outputs[0].shape == (4, 64, 8, 8) diff --git a/tests/test_models/test_segmentors.py b/tests/test_models/test_segmentors.py index faff3f9515..73904e6937 100644 --- a/tests/test_models/test_segmentors.py +++ b/tests/test_models/test_segmentors.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy +from os.path import dirname, exists, join + import numpy as np import pytest import torch -from os.path import dirname, exists, join from mmdet3d.models.builder import build_segmentor from mmdet.apis import set_random_seed @@ -304,3 +305,48 @@ def test_paconv_cuda_ssg(): results = self.forward(return_loss=False, **data_dict) assert results[0]['semantic_mask'].shape == torch.Size([200]) assert results[1]['semantic_mask'].shape == torch.Size([100]) + + +def test_dgcnn(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + set_random_seed(0, True) + dgcnn_cfg = _get_segmentor_cfg( + 'dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py') + dgcnn_cfg.test_cfg.num_points = 32 + self = build_segmentor(dgcnn_cfg).cuda() + points = [torch.rand(4096, 9).float().cuda() for _ in range(2)] + img_metas = [dict(), dict()] + gt_masks = [torch.randint(0, 13, (4096, )).long().cuda() for _ in range(2)] + + # test forward_train + losses = self.forward_train(points, img_metas, gt_masks) + assert losses['decode.loss_sem_seg'].item() >= 0 + + # test loss with ignore_index + ignore_masks = [torch.ones_like(gt_masks[0]) * 13 for _ in range(2)] + losses = self.forward_train(points, img_metas, ignore_masks) + assert losses['decode.loss_sem_seg'].item() == 0 + + # test simple_test + self.eval() + with torch.no_grad(): + scene_points = [ + torch.randn(500, 6).float().cuda() * 3.0, + torch.randn(200, 6).float().cuda() * 2.5 + ] + results = self.simple_test(scene_points, img_metas) + assert results[0]['semantic_mask'].shape == torch.Size([500]) + assert results[1]['semantic_mask'].shape == torch.Size([200]) + + # test aug_test + with torch.no_grad(): + scene_points = [ + torch.randn(2, 500, 6).float().cuda() * 3.0, + torch.randn(2, 200, 6).float().cuda() * 2.5 + ] + img_metas = [[dict(), dict()], [dict(), dict()]] + results = self.aug_test(scene_points, img_metas) + assert results[0]['semantic_mask'].shape == torch.Size([500]) + assert results[1]['semantic_mask'].shape == torch.Size([200]) diff --git a/tests/test_runtime/test_apis.py b/tests/test_runtime/test_apis.py index be1f555457..1f2255bc76 100644 --- a/tests/test_runtime/test_apis.py +++ b/tests/test_runtime/test_apis.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import numpy as np import os -import pytest import tempfile +from os.path import dirname, exists, join + +import numpy as np +import pytest import torch from mmcv.parallel import MMDataParallel -from os.path import dirname, exists, join from mmdet3d.apis import (convert_SyncBN, inference_detector, inference_mono_3d_detector, diff --git a/tests/test_runtime/test_config.py b/tests/test_runtime/test_config.py index a1950e283b..650b46df65 100644 --- a/tests/test_runtime/test_config.py +++ b/tests/test_runtime/test_config.py @@ -61,6 +61,8 @@ def test_config_build_model(): check_parta2_roi_head(head_config, detector.roi_head) elif head_config.type == 'H3DRoIHead': check_h3d_roi_head(head_config, detector.roi_head) + elif head_config.type == 'PointRCNNRoIHead': + check_pointrcnn_roi_head(head_config, detector.roi_head) else: _check_roi_head(head_config, detector.roi_head) # else: @@ -273,3 +275,28 @@ def _check_h3d_bbox_head(bbox_cfg, bbox_head): 12 == bbox_head.line_center_matcher.num_point[0] assert bbox_cfg.suface_matching_cfg.mlp_channels[-1] * \ 18 == bbox_head.bbox_pred[0].in_channels + + +def check_pointrcnn_roi_head(config, head): + assert config['type'] == head.__class__.__name__ + + # check point_roi_extractor + point_roi_cfg = config.point_roi_extractor + point_roi_extractor = head.point_roi_extractor + _check_pointrcnn_roi_extractor(point_roi_cfg, point_roi_extractor) + # check pointrcnn rcnn bboxhead + bbox_cfg = config.bbox_head + bbox_head = head.bbox_head + _check_pointrcnn_bbox_head(bbox_cfg, bbox_head) + + +def _check_pointrcnn_roi_extractor(config, roi_extractor): + assert config['type'] == roi_extractor.__class__.__name__ + assert config.roi_layer.num_sampled_points == \ + roi_extractor.roi_layer.num_sampled_points + + +def _check_pointrcnn_bbox_head(bbox_cfg, bbox_head): + assert bbox_cfg['type'] == bbox_head.__class__.__name__ + assert bbox_cfg.num_classes == bbox_head.num_classes + assert bbox_cfg.with_corner_loss == bbox_head.with_corner_loss diff --git a/tests/test_utils/test_anchors.py b/tests/test_utils/test_anchors.py index 0e7d2cb498..5de25e26cc 100644 --- a/tests/test_utils/test_anchors.py +++ b/tests/test_utils/test_anchors.py @@ -22,7 +22,7 @@ def test_anchor_3d_range_generator(): [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -1.78, 70.4, 39.68, -1.78], ], - sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False) @@ -32,8 +32,8 @@ def test_anchor_3d_range_generator(): '[[0, -39.68, -0.6, 70.4, 39.68, -0.6], ' \ '[0, -39.68, -0.6, 70.4, 39.68, -0.6], ' \ '[0, -39.68, -1.78, 70.4, 39.68, -1.78]],' \ - '\nscales=[1],\nsizes=[[0.6, 0.8, 1.73], ' \ - '[0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],' \ + '\nscales=[1],\nsizes=[[0.8, 0.6, 1.73], ' \ + '[1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],' \ '\nrotations=[0, 1.57],\nreshape_out=False,' \ '\nsize_per_range=True)' assert repr_str == expected_repr_str @@ -54,8 +54,8 @@ def test_aligned_anchor_generator(): ranges=[[-51.2, -51.2, -1.80, 51.2, 51.2, -1.80]], scales=[1, 2, 4], sizes=[ - [0.8660, 2.5981, 1.], # 1.5/sqrt(3) - [0.5774, 1.7321, 1.], # 1/sqrt(3) + [2.5981, 0.8660, 1.], # 1.5/sqrt(3) + [1.7321, 0.5774, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], @@ -71,7 +71,7 @@ def test_aligned_anchor_generator(): # check base anchors expected_grid_anchors = [ torch.tensor([[ - -51.0000, -51.0000, -1.8000, 0.8660, 2.5981, 1.0000, 0.0000, + -51.0000, -51.0000, -1.8000, 2.5981, 0.8660, 1.0000, 0.0000, 0.0000, 0.0000 ], [ 
@@ -91,20 +91,20 @@ def test_aligned_anchor_generator(): 0.0000, 0.0000, 0.0000 ], [ - -49.4000, -51.0000, -1.8000, 0.5774, 1.7321, 1.0000, + -49.4000, -51.0000, -1.8000, 1.7321, 0.5774, 1.0000, 1.5700, 0.0000, 0.0000 ], [ - -49.0000, -51.0000, -1.8000, 0.5774, 1.7321, 1.0000, + -49.0000, -51.0000, -1.8000, 1.7321, 0.5774, 1.0000, 0.0000, 0.0000, 0.0000 ], [ - -48.6000, -51.0000, -1.8000, 0.8660, 2.5981, 1.0000, + -48.6000, -51.0000, -1.8000, 2.5981, 0.8660, 1.0000, 1.5700, 0.0000, 0.0000 ]], device=device), torch.tensor([[ - -50.8000, -50.8000, -1.8000, 1.7320, 5.1962, 2.0000, 0.0000, + -50.8000, -50.8000, -1.8000, 5.1962, 1.7320, 2.0000, 0.0000, 0.0000, 0.0000 ], [ @@ -124,20 +124,20 @@ def test_aligned_anchor_generator(): 0.0000, 0.0000, 0.0000 ], [ - -47.6000, -50.8000, -1.8000, 1.1548, 3.4642, 2.0000, + -47.6000, -50.8000, -1.8000, 3.4642, 1.1548, 2.0000, 1.5700, 0.0000, 0.0000 ], [ - -46.8000, -50.8000, -1.8000, 1.1548, 3.4642, 2.0000, + -46.8000, -50.8000, -1.8000, 3.4642, 1.1548, 2.0000, 0.0000, 0.0000, 0.0000 ], [ - -46.0000, -50.8000, -1.8000, 1.7320, 5.1962, 2.0000, + -46.0000, -50.8000, -1.8000, 5.1962, 1.7320, 2.0000, 1.5700, 0.0000, 0.0000 ]], device=device), torch.tensor([[ - -50.4000, -50.4000, -1.8000, 3.4640, 10.3924, 4.0000, 0.0000, + -50.4000, -50.4000, -1.8000, 10.3924, 3.4640, 4.0000, 0.0000, 0.0000, 0.0000 ], [ @@ -157,15 +157,15 @@ def test_aligned_anchor_generator(): 0.0000, 0.0000, 0.0000 ], [ - -44.0000, -50.4000, -1.8000, 2.3096, 6.9284, 4.0000, + -44.0000, -50.4000, -1.8000, 6.9284, 2.3096, 4.0000, 1.5700, 0.0000, 0.0000 ], [ - -42.4000, -50.4000, -1.8000, 2.3096, 6.9284, 4.0000, + -42.4000, -50.4000, -1.8000, 6.9284, 2.3096, 4.0000, 0.0000, 0.0000, 0.0000 ], [ - -40.8000, -50.4000, -1.8000, 3.4640, 10.3924, 4.0000, + -40.8000, -50.4000, -1.8000, 10.3924, 3.4640, 4.0000, 1.5700, 0.0000, 0.0000 ]], device=device) @@ -194,7 +194,7 @@ def test_aligned_anchor_generator_per_cls(): type='AlignedAnchor3DRangeGeneratorPerCls', ranges=[[-100, -100, -1.80, 100, 100, -1.80], [-100, -100, -1.30, 100, 100, -1.30]], - sizes=[[0.63, 1.76, 1.44], [0.96, 2.35, 1.59]], + sizes=[[1.76, 0.63, 1.44], [2.35, 0.96, 1.59]], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=False) @@ -205,20 +205,20 @@ def test_aligned_anchor_generator_per_cls(): # check base anchors expected_grid_anchors = [[ torch.tensor([[ - -99.0000, -99.0000, -1.8000, 0.6300, 1.7600, 1.4400, 0.0000, + -99.0000, -99.0000, -1.8000, 1.7600, 0.6300, 1.4400, 0.0000, 0.0000, 0.0000 ], [ - -99.0000, -99.0000, -1.8000, 0.6300, 1.7600, 1.4400, + -99.0000, -99.0000, -1.8000, 1.7600, 0.6300, 1.4400, 1.5700, 0.0000, 0.0000 ]], device=device), torch.tensor([[ - -98.0000, -98.0000, -1.3000, 0.9600, 2.3500, 1.5900, 0.0000, + -98.0000, -98.0000, -1.3000, 2.3500, 0.9600, 1.5900, 0.0000, 0.0000, 0.0000 ], [ - -98.0000, -98.0000, -1.3000, 0.9600, 2.3500, 1.5900, + -98.0000, -98.0000, -1.3000, 2.3500, 0.9600, 1.5900, 1.5700, 0.0000, 0.0000 ]], device=device) diff --git a/tests/test_utils/test_bbox_coders.py b/tests/test_utils/test_bbox_coders.py index 7c836a87f5..f16bee04a6 100644 --- a/tests/test_utils/test_bbox_coders.py +++ b/tests/test_utils/test_bbox_coders.py @@ -1,7 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np import torch +from mmcv.cnn import Scale +from torch import nn as nn -from mmdet3d.core.bbox import DepthInstance3DBoxes, LiDARInstance3DBoxes +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes) from mmdet.core import build_bbox_coder @@ -352,3 +356,310 @@ def test_centerpoint_bbox_coder(): assert temp[i]['bboxes'].shape == torch.Size([500, 9]) assert temp[i]['scores'].shape == torch.Size([500]) assert temp[i]['labels'].shape == torch.Size([500]) + + +def test_point_xyzwhlr_bbox_coder(): + bbox_coder_cfg = dict( + type='PointXYZWHLRBBoxCoder', + use_mean_size=True, + mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6, 1.73]]) + boxcoder = build_bbox_coder(bbox_coder_cfg) + + # test encode + gt_bboxes_3d = torch.tensor( + [[13.3329, 2.3514, -0.7004, 1.7508, 0.4702, 1.7909, -3.0522], + [2.2068, -2.6994, -0.3277, 3.8703, 1.6602, 1.6913, -1.9057], + [5.5269, 2.5085, -1.0129, 1.1496, 0.8006, 1.8887, 2.1756]]) + + points = torch.tensor([[13.70, 2.40, 0.12], [3.20, -3.00, 0.2], + [5.70, 2.20, -0.4]]) + + gt_labels_3d = torch.tensor([2, 0, 1]) + + bbox_target = boxcoder.encode(gt_bboxes_3d, points, gt_labels_3d) + expected_bbox_target = torch.tensor([[ + -0.1974, -0.0261, -0.4742, -0.0052, -0.2438, 0.0346, -0.9960, -0.0893 + ], [-0.2356, 0.0713, -0.3383, -0.0076, 0.0369, 0.0808, -0.3287, -0.9444 + ], [-0.1731, 0.3085, -0.3543, 0.3626, 0.2884, 0.0878, -0.5686, + 0.8226]]) + assert torch.allclose(expected_bbox_target, bbox_target, atol=1e-4) + # test decode + bbox3d_out = boxcoder.decode(bbox_target, points, gt_labels_3d) + assert torch.allclose(bbox3d_out, gt_bboxes_3d, atol=1e-4) + + +def test_fcos3d_bbox_coder(): + # test a config without priors + bbox_coder_cfg = dict( + type='FCOS3DBBoxCoder', + base_depths=None, + base_dims=None, + code_size=7, + norm_on_bbox=True) + bbox_coder = build_bbox_coder(bbox_coder_cfg) + + # test decode + # [2, 7, 1, 1] + batch_bbox = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]], [[0.0570]], + [[0.5579]], [[0.1593]], [[0.4553]]], + [[[0.7758]], [[0.2298]], [[0.3925]], [[0.6307]], + [[0.4377]], [[0.3339]], [[0.1966]]]]) + batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)]) + stride = 2 + training = False + cls_score = torch.randn([2, 2, 1, 1]).sigmoid() + decode_bbox = bbox_coder.decode(batch_bbox, batch_scale, stride, training, + cls_score) + + expected_bbox = torch.tensor([[[[0.6261]], [[1.4188]], [[2.3971]], + [[1.0586]], [[1.7470]], [[1.1727]], + [[0.4553]]], + [[[1.5516]], [[0.4596]], [[1.4806]], + [[1.8790]], [[1.5492]], [[1.3965]], + [[0.1966]]]]) + assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3) + + # test a config with priors + prior_bbox_coder_cfg = dict( + type='FCOS3DBBoxCoder', + base_depths=((28., 13.), (25., 12.)), + base_dims=((2., 3., 1.), (1., 2., 3.)), + code_size=7, + norm_on_bbox=True) + prior_bbox_coder = build_bbox_coder(prior_bbox_coder_cfg) + + # test decode + batch_bbox = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]], [[0.0570]], + [[0.5579]], [[0.1593]], [[0.4553]]], + [[[0.7758]], [[0.2298]], [[0.3925]], [[0.6307]], + [[0.4377]], [[0.3339]], [[0.1966]]]]) + batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)]) + stride = 2 + training = False + cls_score = torch.tensor([[[[0.5811]], [[0.6198]]], [[[0.4889]], + [[0.8142]]]]) + decode_bbox = prior_bbox_coder.decode(batch_bbox, batch_scale, stride, + training, cls_score) + expected_bbox = torch.tensor([[[[0.6260]], [[1.4188]], [[35.4916]], + [[1.0587]], [[3.4940]], [[3.5181]], + 
[[0.4553]]], + [[[1.5516]], [[0.4596]], [[29.7100]], + [[1.8789]], [[3.0983]], [[4.1892]], + [[0.1966]]]]) + assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3) + + # test decode_yaw + decode_bbox = decode_bbox.permute(0, 2, 3, 1).view(-1, 7) + batch_centers2d = torch.tensor([[100., 150.], [200., 100.]]) + batch_dir_cls = torch.tensor([0., 1.]) + dir_offset = 0.7854 + cam2img = torch.tensor([[700., 0., 450., 0.], [0., 700., 200., 0.], + [0., 0., 1., 0.], [0., 0., 0., 1.]]) + decode_bbox = prior_bbox_coder.decode_yaw(decode_bbox, batch_centers2d, + batch_dir_cls, dir_offset, + cam2img) + expected_bbox = torch.tensor( + [[0.6260, 1.4188, 35.4916, 1.0587, 3.4940, 3.5181, 3.1332], + [1.5516, 0.4596, 29.7100, 1.8789, 3.0983, 4.1892, 6.1368]]) + assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3) + + +def test_pgd_bbox_coder(): + # test a config without priors + bbox_coder_cfg = dict( + type='PGDBBoxCoder', + base_depths=None, + base_dims=None, + code_size=7, + norm_on_bbox=True) + bbox_coder = build_bbox_coder(bbox_coder_cfg) + + # test decode_2d + # [2, 27, 1, 1] + batch_bbox = torch.tensor([[[[0.0103]], [[0.7394]], [[0.3296]], [[0.4708]], + [[0.1439]], [[0.0778]], [[0.9399]], [[0.8366]], + [[0.1264]], [[0.3030]], [[0.1898]], [[0.0714]], + [[0.4144]], [[0.4341]], [[0.6442]], [[0.2951]], + [[0.2890]], [[0.4486]], [[0.2848]], [[0.1071]], + [[0.9530]], [[0.9460]], [[0.3822]], [[0.9320]], + [[0.2611]], [[0.5580]], [[0.0397]]], + [[[0.8612]], [[0.1680]], [[0.5167]], [[0.8502]], + [[0.0377]], [[0.3615]], [[0.9550]], [[0.5219]], + [[0.1402]], [[0.6843]], [[0.2121]], [[0.9468]], + [[0.6238]], [[0.7918]], [[0.1646]], [[0.0500]], + [[0.6290]], [[0.3956]], [[0.2901]], [[0.4612]], + [[0.7333]], [[0.1194]], [[0.6999]], [[0.3980]], + [[0.3262]], [[0.7185]], [[0.4474]]]]) + batch_scale = nn.ModuleList([Scale(1.0) for _ in range(5)]) + stride = 2 + training = False + cls_score = torch.randn([2, 2, 1, 1]).sigmoid() + decode_bbox = bbox_coder.decode(batch_bbox, batch_scale, stride, training, + cls_score) + max_regress_range = 16 + pred_keypoints = True + pred_bbox2d = True + decode_bbox_w2d = bbox_coder.decode_2d(decode_bbox, batch_scale, stride, + max_regress_range, training, + pred_keypoints, pred_bbox2d) + expected_decode_bbox_w2d = torch.tensor( + [[[[0.0206]], [[1.4788]], + [[1.3904]], [[1.6013]], [[1.1548]], [[1.0809]], [[0.9399]], + [[10.9441]], [[2.0117]], [[4.7049]], [[3.0009]], [[1.1405]], + [[6.2752]], [[6.5399]], [[9.0840]], [[4.5892]], [[4.4994]], + [[6.7320]], [[4.4375]], [[1.7071]], [[11.8582]], [[11.8075]], + [[5.8339]], [[1.8640]], [[0.5222]], [[1.1160]], [[0.0794]]], + [[[1.7224]], [[0.3360]], [[1.6765]], [[2.3401]], [[1.0384]], + [[1.4355]], [[0.9550]], [[7.6666]], [[2.2286]], [[9.5089]], + [[3.3436]], [[11.8133]], [[8.8603]], [[10.5508]], [[2.6101]], + [[0.7993]], [[8.9178]], [[6.0188]], [[4.5156]], [[6.8970]], + [[10.0013]], [[1.9014]], [[9.6689]], [[0.7960]], [[0.6524]], + [[1.4370]], [[0.8948]]]]) + assert torch.allclose(expected_decode_bbox_w2d, decode_bbox_w2d, atol=1e-3) + + # test decode_prob_depth + # [10, 8] + depth_cls_preds = torch.tensor([ + [-0.4383, 0.7207, -0.4092, 0.4649, 0.8526, 0.6186, -1.4312, -0.7150], + [0.0621, 0.2369, 0.5170, 0.8484, -0.1099, 0.1829, -0.0072, 1.0618], + [-1.6114, -0.1057, 0.5721, -0.5986, -2.0471, 0.8140, -0.8385, -0.4822], + [0.0742, -0.3261, 0.4607, 1.8155, -0.3571, -0.0234, 0.3787, 2.3251], + [1.0492, -0.6881, -0.0136, -1.8291, 0.8460, -1.0171, 2.5691, -0.8114], + [0.0968, -0.5601, 1.0458, 0.2560, 1.3018, 0.1635, 0.0680, -1.0263], 
+ [-0.0765, 0.1498, -2.7321, 1.0047, -0.2505, 0.0871, -0.4820, -0.3003], + [-0.4123, 0.2298, -0.1330, -0.6008, 0.6526, 0.7118, 0.9728, -0.7793], + [1.6940, 0.3355, 1.4661, 0.5477, 0.8667, 0.0527, -0.9975, -0.0689], + [0.4724, -0.3632, -0.0654, 0.4034, -0.3494, -0.7548, 0.7297, 1.2754] + ]) + depth_range = (0, 70) + depth_unit = 10 + num_depth_cls = 8 + uniform_prob_depth_preds = bbox_coder.decode_prob_depth( + depth_cls_preds, depth_range, depth_unit, 'uniform', num_depth_cls) + expected_preds = torch.tensor([ + 32.0441, 38.4689, 36.1831, 48.2096, 46.1560, 32.7973, 33.2155, 39.9822, + 21.9905, 43.0161 + ]) + assert torch.allclose(uniform_prob_depth_preds, expected_preds, atol=1e-3) + + linear_prob_depth_preds = bbox_coder.decode_prob_depth( + depth_cls_preds, depth_range, depth_unit, 'linear', num_depth_cls) + expected_preds = torch.tensor([ + 21.1431, 30.2421, 25.8964, 41.6116, 38.6234, 21.4582, 23.2993, 30.1111, + 13.9273, 36.8419 + ]) + assert torch.allclose(linear_prob_depth_preds, expected_preds, atol=1e-3) + + log_prob_depth_preds = bbox_coder.decode_prob_depth( + depth_cls_preds, depth_range, depth_unit, 'log', num_depth_cls) + expected_preds = torch.tensor([ + 12.6458, 24.2487, 17.4015, 36.9375, 27.5982, 12.5510, 15.6635, 19.8408, + 9.1605, 31.3765 + ]) + assert torch.allclose(log_prob_depth_preds, expected_preds, atol=1e-3) + + loguniform_prob_depth_preds = bbox_coder.decode_prob_depth( + depth_cls_preds, depth_range, depth_unit, 'loguniform', num_depth_cls) + expected_preds = torch.tensor([ + 6.9925, 10.3273, 8.9895, 18.6524, 16.4667, 7.3196, 7.5078, 11.3207, + 3.7987, 13.6095 + ]) + assert torch.allclose( + loguniform_prob_depth_preds, expected_preds, atol=1e-3) + + +def test_smoke_bbox_coder(): + bbox_coder_cfg = dict( + type='SMOKECoder', + base_depth=(28.01, 16.32), + base_dims=((3.88, 1.63, 1.53), (1.78, 1.70, 0.58), (0.88, 1.73, 0.67)), + code_size=7) + + bbox_coder = build_bbox_coder(bbox_coder_cfg) + regression = torch.rand([200, 8]) + points = torch.rand([200, 2]) + labels = torch.ones([2, 100]) + cam2imgs = torch.rand([2, 4, 4]) + trans_mats = torch.rand([2, 3, 3]) + + img_metas = [dict(box_type_3d=CameraInstance3DBoxes) for i in range(2)] + locations, dimensions, orientations = bbox_coder.decode( + regression, points, labels, cam2imgs, trans_mats) + assert locations.shape == torch.Size([200, 3]) + assert dimensions.shape == torch.Size([200, 3]) + assert orientations.shape == torch.Size([200, 1]) + bboxes = bbox_coder.encode(locations, dimensions, orientations, img_metas) + assert bboxes.tensor.shape == torch.Size([200, 7]) + + # specically designed to test orientation decode function's + # special cases. 
+ ori_vector = torch.tensor([[-0.9, -0.01], [-0.9, 0.01]]) + locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]]) + orientations = bbox_coder._decode_orientation(ori_vector, locations) + assert orientations.shape == torch.Size([2, 1]) + + +def test_monoflex_bbox_coder(): + bbox_coder_cfg = dict( + type='MonoFlexCoder', + depth_mode='exp', + base_depth=(26.494627, 16.05988), + depth_range=[0.1, 100], + combine_depth=True, + uncertainty_range=[-10, 10], + base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, + 0.1022), (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427), + (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)), + dims_mode='linear', + multibin=True, + num_dir_bins=4, + bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2], + bin_margin=np.pi / 6, + code_size=7) + bbox_coder = build_bbox_coder(bbox_coder_cfg) + gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7])) + orientation_target = bbox_coder.encode(gt_bboxes_3d) + assert orientation_target.shape == torch.Size([6, 8]) + + regression = torch.rand([100, 50]) + base_centers2d = torch.rand([100, 2]) + labels = torch.ones([100]) + downsample_ratio = 4 + cam2imgs = torch.rand([100, 4, 4]) + + preds = bbox_coder.decode(regression, base_centers2d, labels, + downsample_ratio, cam2imgs) + + assert preds['bboxes2d'].shape == torch.Size([100, 4]) + assert preds['dimensions'].shape == torch.Size([100, 3]) + assert preds['offsets2d'].shape == torch.Size([100, 2]) + assert preds['keypoints2d'].shape == torch.Size([100, 10, 2]) + assert preds['orientations'].shape == torch.Size([100, 16]) + assert preds['direct_depth'].shape == torch.Size([ + 100, + ]) + assert preds['keypoints_depth'].shape == torch.Size([100, 3]) + assert preds['combined_depth'].shape == torch.Size([ + 100, + ]) + assert preds['direct_depth_uncertainty'].shape == torch.Size([ + 100, + ]) + assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3]) + + offsets_2d = torch.randn([100, 2]) + depths = torch.randn([ + 100, + ]) + locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths, + cam2imgs, downsample_ratio) + assert locations.shape == torch.Size([100, 3]) + + orientations = torch.randn([100, 16]) + yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations) + assert yaws.shape == torch.Size([ + 100, + ]) + assert local_yaws.shape == torch.Size([ + 100, + ]) diff --git a/tests/test_utils/test_box3d.py b/tests/test_utils/test_box3d.py index b78883290c..733a9e7b7a 100644 --- a/tests/test_utils/test_box3d.py +++ b/tests/test_utils/test_box3d.py @@ -1,13 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import unittest + import numpy as np import pytest import torch -import unittest from mmdet3d.core.bbox import (BaseInstance3DBoxes, Box3DMode, - CameraInstance3DBoxes, DepthInstance3DBoxes, - LiDARInstance3DBoxes, bbox3d2roi, - bbox3d_mapping_back) + CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes, + bbox3d2roi, bbox3d_mapping_back) from mmdet3d.core.bbox.structures.utils import (get_box_type, limit_period, points_cam2img, rotation_3d_in_axis, @@ -140,10 +141,15 @@ def test_lidar_boxes3d(): assert torch.allclose(expected_tensor, bottom_center_box.tensor) # Test init with numpy array - np_boxes = np.array( - [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48], - [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62]], - dtype=np.float32) + np_boxes = np.array([[ + 1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.48 - 0.13603681398218053 * 4 + ], + [ + 8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, + 1.62 - 0.13603681398218053 * 4 + ]], + dtype=np.float32) boxes_1 = LiDARInstance3DBoxes(np_boxes) assert torch.allclose(boxes_1.tensor, torch.from_numpy(np_boxes)) @@ -157,15 +163,15 @@ def test_lidar_boxes3d(): th_boxes = torch.tensor( [[ 28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002, - 1.48000002, -1.57000005 + 1.48000002, -1.57000005 - 0.13603681398218053 * 4 ], [ 26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002, - 1.39999998, -1.69000006 + 1.39999998, -1.69000006 - 0.13603681398218053 * 4 ], [ 31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998, - 1.48000002, 2.78999996 + 1.48000002, 2.78999996 - 0.13603681398218053 * 4 ]], dtype=torch.float32) boxes_2 = LiDARInstance3DBoxes(th_boxes) @@ -176,12 +182,30 @@ def test_lidar_boxes3d(): boxes_1 = boxes_1.to(boxes_2.device) # test box concatenation - expected_tensor = torch.tensor( - [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48], - [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62], - [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57], - [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69], - [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]]) + expected_tensor = torch.tensor([[ + 1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.48 - 0.13603681398218053 * 4 + ], + [ + 8.959413, 2.4567227, -1.6357126, 1.54, + 4.01, 1.57, + 1.62 - 0.13603681398218053 * 4 + ], + [ + 28.2967, -0.5557558, -1.303325, 1.47, + 2.23, 1.48, + -1.57 - 0.13603681398218053 * 4 + ], + [ + 26.66902, 21.82302, -1.736057, 1.56, + 3.48, 1.4, + -1.69 - 0.13603681398218053 * 4 + ], + [ + 31.31978, 8.162144, -1.6217787, 1.74, + 3.77, 1.48, + 2.79 - 0.13603681398218053 * 4 + ]]) boxes = LiDARInstance3DBoxes.cat([boxes_1, boxes_2]) assert torch.allclose(boxes.tensor, expected_tensor) # concatenate empty list @@ -196,11 +220,26 @@ def test_lidar_boxes3d(): [0.6533, -0.5520, -0.5265], [4.5870, 0.5358, -1.4741]]) expected_tensor = torch.tensor( - [[1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.6615927], - [8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.5215927], - [28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48, 4.7115927], - [26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, 4.8315926], - [31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, 0.35159278]]) + [[ + 1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.6615927 - np.pi + 0.13603681398218053 * 4 + ], + [ + 8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57, + 1.5215927 - np.pi + 0.13603681398218053 * 4 + ], + [ + 28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48, + 4.7115927 - np.pi 
+ 0.13603681398218053 * 4 + ], + [ + 26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, + 4.8315926 - np.pi + 0.13603681398218053 * 4 + ], + [ + 31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, + 0.35159278 - np.pi + 0.13603681398218053 * 4 + ]]) expected_points = torch.tensor([[1.2559, 0.6762, -1.4658], [4.7814, 0.8784, -1.3857], [6.7053, -0.2517, -0.9697], @@ -211,11 +250,26 @@ def test_lidar_boxes3d(): assert torch.allclose(points, expected_points, 1e-3) expected_tensor = torch.tensor( - [[-1.7802, -2.5162, -1.7501, 1.7500, 3.3900, 1.6500, -1.6616], - [-8.9594, -2.4567, -1.6357, 1.5400, 4.0100, 1.5700, -1.5216], - [-28.2967, 0.5558, -1.3033, 1.4700, 2.2300, 1.4800, -4.7116], - [-26.6690, -21.8230, -1.7361, 1.5600, 3.4800, 1.4000, -4.8316], - [-31.3198, -8.1621, -1.6218, 1.7400, 3.7700, 1.4800, -0.3516]]) + [[ + -1.7802, -2.5162, -1.7501, 1.7500, 3.3900, 1.6500, + -1.6616 + np.pi * 2 - 0.13603681398218053 * 4 + ], + [ + -8.9594, -2.4567, -1.6357, 1.5400, 4.0100, 1.5700, + -1.5216 + np.pi * 2 - 0.13603681398218053 * 4 + ], + [ + -28.2967, 0.5558, -1.3033, 1.4700, 2.2300, 1.4800, + -4.7116 + np.pi * 2 - 0.13603681398218053 * 4 + ], + [ + -26.6690, -21.8230, -1.7361, 1.5600, 3.4800, 1.4000, + -4.8316 + np.pi * 2 - 0.13603681398218053 * 4 + ], + [ + -31.3198, -8.1621, -1.6218, 1.7400, 3.7700, 1.4800, + -0.3516 + np.pi * 2 - 0.13603681398218053 * 4 + ]]) boxes_flip_vert = boxes.clone() points = boxes_flip_vert.flip('vertical', points) expected_points = torch.tensor([[-1.2559, 0.6762, -1.4658], @@ -229,12 +283,27 @@ def test_lidar_boxes3d(): # test box rotation # with input torch.Tensor points and angle expected_tensor = torch.tensor( - [[1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500, 1.7976], - [8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700, 1.6576], - [28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800, 4.8476], - [23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000, 4.9676], - [29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800, 0.4876]]) - points, rot_mat_T = boxes.rotate(0.13603681398218053, points) + [[ + 1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500, + 1.7976 - np.pi + 0.13603681398218053 * 2 + ], + [ + 8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700, + 1.6576 - np.pi + 0.13603681398218053 * 2 + ], + [ + 28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800, + 4.8476 - np.pi + 0.13603681398218053 * 2 + ], + [ + 23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000, + 4.9676 - np.pi + 0.13603681398218053 * 2 + ], + [ + 29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800, + 0.4876 - np.pi + 0.13603681398218053 * 2 + ]]) + points, rot_mat_T = boxes.rotate(-0.13603681398218053, points) expected_points = torch.tensor([[-1.1526, 0.8403, -1.4658], [-4.6181, 1.5187, -1.3857], [-6.6775, 0.6600, -0.9697], @@ -248,7 +317,7 @@ def test_lidar_boxes3d(): assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3) # with input torch.Tensor points and rotation matrix - points, rot_mat_T = boxes.rotate(-0.13603681398218053, points) # back + points, rot_mat_T = boxes.rotate(0.13603681398218053, points) # back rot_mat = np.array([[0.99076125, -0.13561762, 0.], [0.13561762, 0.99076125, 0.], [0., 0., 1.]]) points, rot_mat_T = boxes.rotate(rot_mat, points) @@ -262,7 +331,7 @@ def test_lidar_boxes3d(): [-6.5263, 1.5595, -0.9697], [-0.4809, 0.7073, -0.5265], [-4.5623, 0.7166, -1.4741]]) - points_np, rot_mat_T_np = boxes.rotate(0.13603681398218053, points_np) + points_np, rot_mat_T_np = boxes.rotate(-0.13603681398218053, points_np) expected_points_np = np.array([[-0.8844, 1.1191, -1.4658], [-4.0401, 2.7039, 
-1.3857], [-6.2545, 2.4302, -0.9697], @@ -276,7 +345,7 @@ def test_lidar_boxes3d(): assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3) # with input LiDARPoints and rotation matrix - points_np, rot_mat_T_np = boxes.rotate(-0.13603681398218053, points_np) + points_np, rot_mat_T_np = boxes.rotate(0.13603681398218053, points_np) lidar_points = LiDARPoints(points_np) lidar_points, rot_mat_T_np = boxes.rotate(rot_mat, lidar_points) points_np = lidar_points.tensor.numpy() @@ -287,27 +356,27 @@ def test_lidar_boxes3d(): # test box scaling expected_tensor = torch.tensor([[ 1.0443488, -2.9183323, -1.7599131, 1.7597977, 3.4089797, 1.6592377, - 1.9336663 + 1.9336663 - np.pi ], [ 8.014273, -4.8007393, -1.6448704, 1.5486219, 4.0324507, 1.57879, - 1.7936664 + 1.7936664 - np.pi ], [ 27.558605, -7.1084175, -1.310622, 1.4782301, 2.242485, 1.488286, - 4.9836664 + 4.9836664 - np.pi ], [ 19.934517, -28.344835, -1.7457767, 1.5687338, 3.4994833, 1.4078381, - 5.1036663 + 5.1036663 - np.pi ], [ 28.130915, -16.369587, -1.6308585, 1.7497417, 3.791107, 1.488286, - 0.6236664 + 0.6236664 - np.pi ]]) boxes.scale(1.00559866335275) assert torch.allclose(boxes.tensor, expected_tensor) @@ -315,32 +384,39 @@ def test_lidar_boxes3d(): # test box translation expected_tensor = torch.tensor([[ 1.1281544, -3.0507944, -1.9169292, 1.7597977, 3.4089797, 1.6592377, - 1.9336663 + 1.9336663 - np.pi ], [ 8.098079, -4.9332013, -1.8018866, 1.5486219, 4.0324507, 1.57879, - 1.7936664 + 1.7936664 - np.pi ], [ 27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286, - 4.9836664 + 4.9836664 - np.pi ], [ 20.018322, -28.477297, -1.9027928, 1.5687338, 3.4994833, 1.4078381, - 5.1036663 + 5.1036663 - np.pi ], [ 28.21472, -16.502048, -1.7878747, 1.7497417, 3.791107, 1.488286, - 0.6236664 + 0.6236664 - np.pi ]]) boxes.translate([0.0838056, -0.13246193, -0.15701613]) assert torch.allclose(boxes.tensor, expected_tensor) # test bbox in_range_bev + expected_tensor = torch.tensor( + [[1.1282, -3.0508, 1.7598, 3.4090, -1.2079], + [8.0981, -4.9332, 1.5486, 4.0325, -1.3479], + [27.6424, -7.2409, 1.4782, 2.2425, 1.8421], + [20.0183, -28.4773, 1.5687, 3.4995, 1.9621], + [28.2147, -16.5020, 1.7497, 3.7911, -2.5179]]) + assert torch.allclose(boxes.bev, expected_tensor, atol=1e-3) expected_tensor = torch.tensor([1, 1, 1, 1, 1], dtype=torch.bool) mask = boxes.in_range_bev([0., -40., 70.4, 40.]) assert (mask == expected_tensor).all() @@ -356,17 +432,17 @@ def test_lidar_boxes3d(): index_boxes = boxes[2:5] expected_tensor = torch.tensor([[ 27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286, - 4.9836664 + 4.9836664 - np.pi ], [ 20.018322, -28.477297, -1.9027928, 1.5687338, 3.4994833, 1.4078381, - 5.1036663 + 5.1036663 - np.pi ], [ 28.21472, -16.502048, -1.7878747, 1.7497417, 3.791107, 1.488286, - 0.6236664 + 0.6236664 - np.pi ]]) assert len(index_boxes) == 3 assert torch.allclose(index_boxes.tensor, expected_tensor) @@ -374,7 +450,7 @@ def test_lidar_boxes3d(): index_boxes = boxes[2] expected_tensor = torch.tensor([[ 27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286, - 4.9836664 + 4.9836664 - np.pi ]]) assert len(index_boxes) == 1 assert torch.allclose(index_boxes.tensor, expected_tensor) @@ -382,12 +458,12 @@ def test_lidar_boxes3d(): index_boxes = boxes[[2, 4]] expected_tensor = torch.tensor([[ 27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286, - 4.9836664 + 4.9836664 - np.pi ], [ 28.21472, -16.502048, -1.7878747, 1.7497417, 3.791107, 1.488286, - 0.6236664 + 0.6236664 - np.pi ]]) assert len(index_boxes) == 
2 assert torch.allclose(index_boxes.tensor, expected_tensor) @@ -408,13 +484,13 @@ def test_lidar_boxes3d(): assert (boxes.tensor[:, 6] >= -np.pi / 2).all() Box3DMode.convert(boxes, Box3DMode.LIDAR, Box3DMode.LIDAR) - expected_tesor = boxes.tensor.clone() - assert torch.allclose(expected_tesor, boxes.tensor) + expected_tensor = boxes.tensor.clone() + assert torch.allclose(expected_tensor, boxes.tensor) boxes.flip() boxes.flip() boxes.limit_yaw() - assert torch.allclose(expected_tesor, boxes.tensor) + assert torch.allclose(expected_tensor, boxes.tensor) # test nearest_bev expected_tensor = torch.tensor([[-0.5763, -3.9307, 2.8326, -2.1709], @@ -422,52 +498,50 @@ def test_lidar_boxes3d(): [26.5212, -7.9800, 28.7637, -6.5018], [18.2686, -29.2617, 21.7681, -27.6929], [27.3398, -18.3976, 29.0896, -14.6065]]) - # the pytorch print loses some precision assert torch.allclose( boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7) - # obtained by the print of the original implementation - expected_tensor = torch.tensor([[[2.4093e+00, -4.4784e+00, -1.9169e+00], - [2.4093e+00, -4.4784e+00, -2.5769e-01], - [-7.7767e-01, -3.2684e+00, -2.5769e-01], - [-7.7767e-01, -3.2684e+00, -1.9169e+00], - [3.0340e+00, -2.8332e+00, -1.9169e+00], - [3.0340e+00, -2.8332e+00, -2.5769e-01], - [-1.5301e-01, -1.6232e+00, -2.5769e-01], - [-1.5301e-01, -1.6232e+00, -1.9169e+00]], - [[9.8933e+00, -6.1340e+00, -1.8019e+00], - [9.8933e+00, -6.1340e+00, -2.2310e-01], - [5.9606e+00, -5.2427e+00, -2.2310e-01], - [5.9606e+00, -5.2427e+00, -1.8019e+00], - [1.0236e+01, -4.6237e+00, -1.8019e+00], - [1.0236e+01, -4.6237e+00, -2.2310e-01], - [6.3029e+00, -3.7324e+00, -2.2310e-01], - [6.3029e+00, -3.7324e+00, -1.8019e+00]], - [[2.8525e+01, -8.2534e+00, -1.4676e+00], - [2.8525e+01, -8.2534e+00, 2.0648e-02], - [2.6364e+01, -7.6525e+00, 2.0648e-02], - [2.6364e+01, -7.6525e+00, -1.4676e+00], - [2.8921e+01, -6.8292e+00, -1.4676e+00], - [2.8921e+01, -6.8292e+00, 2.0648e-02], - [2.6760e+01, -6.2283e+00, 2.0648e-02], - [2.6760e+01, -6.2283e+00, -1.4676e+00]], - [[2.1337e+01, -2.9870e+01, -1.9028e+00], - [2.1337e+01, -2.9870e+01, -4.9495e-01], - [1.8102e+01, -2.8535e+01, -4.9495e-01], - [1.8102e+01, -2.8535e+01, -1.9028e+00], - [2.1935e+01, -2.8420e+01, -1.9028e+00], - [2.1935e+01, -2.8420e+01, -4.9495e-01], - [1.8700e+01, -2.7085e+01, -4.9495e-01], - [1.8700e+01, -2.7085e+01, -1.9028e+00]], - [[2.6398e+01, -1.7530e+01, -1.7879e+00], - [2.6398e+01, -1.7530e+01, -2.9959e-01], - [2.8612e+01, -1.4452e+01, -2.9959e-01], - [2.8612e+01, -1.4452e+01, -1.7879e+00], - [2.7818e+01, -1.8552e+01, -1.7879e+00], - [2.7818e+01, -1.8552e+01, -2.9959e-01], - [3.0032e+01, -1.5474e+01, -2.9959e-01], - [3.0032e+01, -1.5474e+01, -1.7879e+00]]]) - # the pytorch print loses some precision + expected_tensor = torch.tensor([[[-7.7767e-01, -2.8332e+00, -1.9169e+00], + [-7.7767e-01, -2.8332e+00, -2.5769e-01], + [2.4093e+00, -1.6232e+00, -2.5769e-01], + [2.4093e+00, -1.6232e+00, -1.9169e+00], + [-1.5301e-01, -4.4784e+00, -1.9169e+00], + [-1.5301e-01, -4.4784e+00, -2.5769e-01], + [3.0340e+00, -3.2684e+00, -2.5769e-01], + [3.0340e+00, -3.2684e+00, -1.9169e+00]], + [[5.9606e+00, -4.6237e+00, -1.8019e+00], + [5.9606e+00, -4.6237e+00, -2.2310e-01], + [9.8933e+00, -3.7324e+00, -2.2310e-01], + [9.8933e+00, -3.7324e+00, -1.8019e+00], + [6.3029e+00, -6.1340e+00, -1.8019e+00], + [6.3029e+00, -6.1340e+00, -2.2310e-01], + [1.0236e+01, -5.2427e+00, -2.2310e-01], + [1.0236e+01, -5.2427e+00, -1.8019e+00]], + [[2.6364e+01, -6.8292e+00, -1.4676e+00], + [2.6364e+01, -6.8292e+00, 
2.0648e-02], + [2.8525e+01, -6.2283e+00, 2.0648e-02], + [2.8525e+01, -6.2283e+00, -1.4676e+00], + [2.6760e+01, -8.2534e+00, -1.4676e+00], + [2.6760e+01, -8.2534e+00, 2.0648e-02], + [2.8921e+01, -7.6525e+00, 2.0648e-02], + [2.8921e+01, -7.6525e+00, -1.4676e+00]], + [[1.8102e+01, -2.8420e+01, -1.9028e+00], + [1.8102e+01, -2.8420e+01, -4.9495e-01], + [2.1337e+01, -2.7085e+01, -4.9495e-01], + [2.1337e+01, -2.7085e+01, -1.9028e+00], + [1.8700e+01, -2.9870e+01, -1.9028e+00], + [1.8700e+01, -2.9870e+01, -4.9495e-01], + [2.1935e+01, -2.8535e+01, -4.9495e-01], + [2.1935e+01, -2.8535e+01, -1.9028e+00]], + [[2.8612e+01, -1.8552e+01, -1.7879e+00], + [2.8612e+01, -1.8552e+01, -2.9959e-01], + [2.6398e+01, -1.5474e+01, -2.9959e-01], + [2.6398e+01, -1.5474e+01, -1.7879e+00], + [3.0032e+01, -1.7530e+01, -1.7879e+00], + [3.0032e+01, -1.7530e+01, -2.9959e-01], + [2.7818e+01, -1.4452e+01, -2.9959e-01], + [2.7818e+01, -1.4452e+01, -1.7879e+00]]]) + assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-4, atol=1e-7) # test new_box @@ -558,31 +632,32 @@ def test_boxes_conversion(): [0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00]], dtype=torch.float32) + # coord sys refactor (reverse sign of yaw) expected_tensor = torch.tensor( [[ - 2.16902434e+01, -4.06038554e-02, -1.61906639e+00, 1.65999997e+00, - 3.20000005e+00, 1.61000001e+00, -1.53999996e+00 + 2.16902434e+01, -4.06038554e-02, -1.61906639e+00, 3.20000005e+00, + 1.65999997e+00, 1.61000001e+00, 1.53999996e+00 - np.pi / 2 ], [ - 7.05006905e+00, -6.57459601e+00, -1.60107949e+00, 2.27999997e+00, - 1.27799997e+01, 3.66000009e+00, 1.54999995e+00 + 7.05006905e+00, -6.57459601e+00, -1.60107949e+00, 1.27799997e+01, + 2.27999997e+00, 3.66000009e+00, -1.54999995e+00 - np.pi / 2 ], [ - 2.24698818e+01, -6.69203759e+00, -1.50118145e+00, 2.31999993e+00, - 1.47299995e+01, 3.64000010e+00, 1.59000003e+00 + 2.24698818e+01, -6.69203759e+00, -1.50118145e+00, 1.47299995e+01, + 2.31999993e+00, 3.64000010e+00, -1.59000003e+00 + 3 * np.pi / 2 ], [ - 3.48291965e+01, -7.09058388e+00, -1.36622983e+00, 2.31999993e+00, - 1.00400000e+01, 3.60999990e+00, 1.61000001e+00 + 3.48291965e+01, -7.09058388e+00, -1.36622983e+00, 1.00400000e+01, + 2.31999993e+00, 3.60999990e+00, -1.61000001e+00 + 3 * np.pi / 2 ], [ - 4.62394617e+01, -7.75838800e+00, -1.32405020e+00, 2.33999991e+00, - 1.28299999e+01, 3.63000011e+00, 1.63999999e+00 + 4.62394617e+01, -7.75838800e+00, -1.32405020e+00, 1.28299999e+01, + 2.33999991e+00, 3.63000011e+00, -1.63999999e+00 + 3 * np.pi / 2 ]], dtype=torch.float32) rt_mat = rect @ Trv2c - # test coversion with Box type + # test conversion with Box type cam_to_lidar_box = Box3DMode.convert(camera_boxes, Box3DMode.CAM, Box3DMode.LIDAR, rt_mat.inverse()) assert torch.allclose(cam_to_lidar_box.tensor, expected_tensor) @@ -637,10 +712,15 @@ def test_boxes_conversion(): def test_camera_boxes3d(): # Test init with numpy array - np_boxes = np.array( - [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48], - [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62]], - dtype=np.float32) + np_boxes = np.array([[ + 1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.48 - 0.13603681398218053 * 4 - 2 * np.pi + ], + [ + 8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, + 1.62 - 0.13603681398218053 * 4 - 2 * np.pi + ]], + dtype=np.float32) boxes_1 = Box3DMode.convert( LiDARInstance3DBoxes(np_boxes), Box3DMode.LIDAR, Box3DMode.CAM) @@ -654,15 +734,15 @@ def test_camera_boxes3d(): th_boxes = torch.tensor( [[ 28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002, - 
1.48000002, -1.57000005 + 1.48000002, -1.57000005 - 0.13603681398218053 * 4 - 2 * np.pi ], [ 26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002, - 1.39999998, -1.69000006 + 1.39999998, -1.69000006 - 0.13603681398218053 * 4 - 2 * np.pi ], [ 31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998, - 1.48000002, 2.78999996 + 1.48000002, 2.78999996 - 0.13603681398218053 * 4 - 2 * np.pi ]], dtype=torch.float32) cam_th_boxes = Box3DMode.convert(th_boxes, Box3DMode.LIDAR, Box3DMode.CAM) @@ -675,13 +755,26 @@ def test_camera_boxes3d(): # test box concatenation expected_tensor = Box3DMode.convert( - torch.tensor( - [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48], - [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62], - [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57], - [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69], - [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]]), - Box3DMode.LIDAR, Box3DMode.CAM) + torch.tensor([[ + 1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.48 - 0.13603681398218053 * 4 - 2 * np.pi + ], + [ + 8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, + 1.62 - 0.13603681398218053 * 4 - 2 * np.pi + ], + [ + 28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, + -1.57 - 0.13603681398218053 * 4 - 2 * np.pi + ], + [ + 26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, + -1.69 - 0.13603681398218053 * 4 - 2 * np.pi + ], + [ + 31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, + 2.79 - 0.13603681398218053 * 4 - 2 * np.pi + ]]), Box3DMode.LIDAR, Box3DMode.CAM) boxes = CameraInstance3DBoxes.cat([boxes_1, boxes_2]) assert torch.allclose(boxes.tensor, expected_tensor) @@ -690,28 +783,60 @@ def test_camera_boxes3d(): [-0.2517, 0.9697, 6.7053], [0.5520, 0.5265, 0.6533], [-0.5358, 1.4741, 4.5870]]) expected_tensor = Box3DMode.convert( - torch.tensor( - [[1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.6615927], - [8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.5215927], - [28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48, 4.7115927], - [26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, 4.8315926], - [31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, 0.35159278]]), - Box3DMode.LIDAR, Box3DMode.CAM) + torch.tensor([[ + 1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65, + 1.6615927 + 0.13603681398218053 * 4 - np.pi + ], + [ + 8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57, + 1.5215927 + 0.13603681398218053 * 4 - np.pi + ], + [ + 28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48, + 4.7115927 + 0.13603681398218053 * 4 - np.pi + ], + [ + 26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, + 4.8315926 + 0.13603681398218053 * 4 - np.pi + ], + [ + 31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, + 0.35159278 + 0.13603681398218053 * 4 - np.pi + ]]), Box3DMode.LIDAR, Box3DMode.CAM) points = boxes.flip('horizontal', points) expected_points = torch.tensor([[-0.6762, 1.4658, 1.2559], [-0.8784, 1.3857, 4.7814], [0.2517, 0.9697, 6.7053], [-0.5520, 0.5265, 0.6533], [0.5358, 1.4741, 4.5870]]) - assert torch.allclose(boxes.tensor, expected_tensor) + + yaw_normalized_tensor = boxes.tensor.clone() + yaw_normalized_tensor[:, -1:] = limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3) assert torch.allclose(points, expected_points, 1e-3) expected_tensor = torch.tensor( - [[2.5162, 1.7501, -1.7802, 3.3900, 1.6500, 1.7500, -1.6616], - [2.4567, 1.6357, -8.9594, 4.0100, 1.5700, 1.5400, -1.5216], - [-0.5558, 1.3033, -28.2967, 2.2300, 1.4800, 1.4700, -4.7116], - 
[21.8230, 1.7361, -26.6690, 3.4800, 1.4000, 1.5600, -4.8316], - [8.1621, 1.6218, -31.3198, 3.7700, 1.4800, 1.7400, -0.3516]]) + [[ + 2.5162, 1.7501, -1.7802, 1.7500, 1.6500, 3.3900, + 1.6616 + 0.13603681398218053 * 4 - np.pi / 2 + ], + [ + 2.4567, 1.6357, -8.9594, 1.5400, 1.5700, 4.0100, + 1.5216 + 0.13603681398218053 * 4 - np.pi / 2 + ], + [ + -0.5558, 1.3033, -28.2967, 1.4700, 1.4800, 2.2300, + 4.7116 + 0.13603681398218053 * 4 - np.pi / 2 + ], + [ + 21.8230, 1.7361, -26.6690, 1.5600, 1.4000, 3.4800, + 4.8316 + 0.13603681398218053 * 4 - np.pi / 2 + ], + [ + 8.1621, 1.6218, -31.3198, 1.7400, 1.4800, 3.7700, + 0.3516 + 0.13603681398218053 * 4 - np.pi / 2 + ]]) boxes_flip_vert = boxes.clone() points = boxes_flip_vert.flip('vertical', points) expected_points = torch.tensor([[-0.6762, 1.4658, -1.2559], @@ -719,19 +844,38 @@ def test_camera_boxes3d(): [0.2517, 0.9697, -6.7053], [-0.5520, 0.5265, -0.6533], [0.5358, 1.4741, -4.5870]]) - assert torch.allclose(boxes_flip_vert.tensor, expected_tensor, 1e-4) + + yaw_normalized_tensor = boxes_flip_vert.tensor.clone() + yaw_normalized_tensor[:, -1:] = limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-4) assert torch.allclose(points, expected_points) # test box rotation # with input torch.Tensor points and angle expected_tensor = Box3DMode.convert( - torch.tensor( - [[1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500, 1.7976], - [8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700, 1.6576], - [28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800, 4.8476], - [23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000, 4.9676], - [29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800, 0.4876]]), - Box3DMode.LIDAR, Box3DMode.CAM) + torch.tensor([[ + 1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500, + 1.7976 + 0.13603681398218053 * 2 - np.pi + ], + [ + 8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700, + 1.6576 + 0.13603681398218053 * 2 - np.pi + ], + [ + 28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800, + 4.8476 + 0.13603681398218053 * 2 - np.pi + ], + [ + 23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000, + 4.9676 + 0.13603681398218053 * 2 - np.pi + ], + [ + 29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800, + 0.4876 + 0.13603681398218053 * 2 - np.pi + ]]), Box3DMode.LIDAR, Box3DMode.CAM) points, rot_mat_T = boxes.rotate(torch.tensor(0.13603681398218053), points) expected_points = torch.tensor([[-0.8403, 1.4658, -1.1526], [-1.5187, 1.3857, -4.6181], @@ -741,7 +885,12 @@ def test_camera_boxes3d(): expected_rot_mat_T = torch.tensor([[0.9908, 0.0000, -0.1356], [0.0000, 1.0000, 0.0000], [0.1356, 0.0000, 0.9908]]) - assert torch.allclose(boxes.tensor, expected_tensor, 1e-3) + yaw_normalized_tensor = boxes.tensor.clone() + yaw_normalized_tensor[:, -1:] = limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3) assert torch.allclose(points, expected_points, 1e-3) assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3) @@ -751,7 +900,10 @@ def test_camera_boxes3d(): rot_mat = np.array([[0.99076125, 0., -0.13561762], [0., 1., 0.], [0.13561762, 0., 0.99076125]]) points, rot_mat_T = boxes.rotate(rot_mat, points) - assert torch.allclose(boxes.tensor, expected_tensor, 1e-3) + yaw_normalized_tensor = boxes.tensor.clone() + yaw_normalized_tensor[:, -1:] = 
limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3) assert torch.allclose(points, expected_points, 1e-3) assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3) @@ -788,51 +940,61 @@ def test_camera_boxes3d(): expected_tensor = Box3DMode.convert( torch.tensor([[ 1.0443488, -2.9183323, -1.7599131, 1.7597977, 3.4089797, 1.6592377, - 1.9336663 + 1.9336663 - np.pi ], [ 8.014273, -4.8007393, -1.6448704, 1.5486219, - 4.0324507, 1.57879, 1.7936664 + 4.0324507, 1.57879, 1.7936664 - np.pi ], [ 27.558605, -7.1084175, -1.310622, 1.4782301, - 2.242485, 1.488286, 4.9836664 + 2.242485, 1.488286, 4.9836664 - np.pi ], [ 19.934517, -28.344835, -1.7457767, 1.5687338, - 3.4994833, 1.4078381, 5.1036663 + 3.4994833, 1.4078381, 5.1036663 - np.pi ], [ 28.130915, -16.369587, -1.6308585, 1.7497417, - 3.791107, 1.488286, 0.6236664 + 3.791107, 1.488286, 0.6236664 - np.pi ]]), Box3DMode.LIDAR, Box3DMode.CAM) boxes.scale(1.00559866335275) - assert torch.allclose(boxes.tensor, expected_tensor) + yaw_normalized_tensor = boxes.tensor.clone() + yaw_normalized_tensor[:, -1:] = limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor) # test box translation expected_tensor = Box3DMode.convert( torch.tensor([[ 1.1281544, -3.0507944, -1.9169292, 1.7597977, 3.4089797, 1.6592377, - 1.9336663 + 1.9336663 - np.pi ], [ 8.098079, -4.9332013, -1.8018866, 1.5486219, - 4.0324507, 1.57879, 1.7936664 + 4.0324507, 1.57879, 1.7936664 - np.pi ], [ 27.64241, -7.2408795, -1.4676381, 1.4782301, - 2.242485, 1.488286, 4.9836664 + 2.242485, 1.488286, 4.9836664 - np.pi ], [ 20.018322, -28.477297, -1.9027928, 1.5687338, - 3.4994833, 1.4078381, 5.1036663 + 3.4994833, 1.4078381, 5.1036663 - np.pi ], [ 28.21472, -16.502048, -1.7878747, 1.7497417, - 3.791107, 1.488286, 0.6236664 + 3.791107, 1.488286, 0.6236664 - np.pi ]]), Box3DMode.LIDAR, Box3DMode.CAM) boxes.translate(torch.tensor([0.13246193, 0.15701613, 0.0838056])) - assert torch.allclose(boxes.tensor, expected_tensor) + yaw_normalized_tensor = boxes.tensor.clone() + yaw_normalized_tensor[:, -1:] = limit_period( + yaw_normalized_tensor[:, -1:], period=np.pi * 2) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) + assert torch.allclose(yaw_normalized_tensor, expected_tensor) # test bbox in_range_bev expected_tensor = torch.tensor([1, 1, 1, 1, 1], dtype=torch.bool) @@ -846,6 +1008,14 @@ def test_camera_boxes3d(): mask = boxes.in_range_3d([-2, -5, 0, 20, 2, 22]) assert (mask == expected_tensor).all() + expected_tensor = torch.tensor( + [[3.0508, 1.1282, 1.7598, 3.4090, -5.9203], + [4.9332, 8.0981, 1.5486, 4.0325, -6.0603], + [7.2409, 27.6424, 1.4782, 2.2425, -2.8703], + [28.4773, 20.0183, 1.5687, 3.4995, -2.7503], + [16.5020, 28.2147, 1.7497, 3.7911, -0.9471]]) + assert torch.allclose(boxes.bev, expected_tensor, atol=1e-3) + # test properties assert torch.allclose(boxes.bottom_center, boxes.tensor[:, :3]) expected_tensor = ( @@ -858,13 +1028,13 @@ def test_camera_boxes3d(): assert (boxes.tensor[:, 6] >= -np.pi / 2).all() Box3DMode.convert(boxes, Box3DMode.LIDAR, Box3DMode.LIDAR) - expected_tesor = boxes.tensor.clone() - assert torch.allclose(expected_tesor, boxes.tensor) + expected_tensor = boxes.tensor.clone() + assert torch.allclose(expected_tensor, boxes.tensor) boxes.flip() boxes.flip() boxes.limit_yaw() - assert 
torch.allclose(expected_tesor, boxes.tensor) + assert torch.allclose(expected_tensor, boxes.tensor) # test nearest_bev # BEV box in lidar coordinates (x, y) @@ -878,54 +1048,66 @@ def test_camera_boxes3d(): expected_tensor = lidar_expected_tensor.clone() expected_tensor[:, 0::2] = -lidar_expected_tensor[:, [3, 1]] expected_tensor[:, 1::2] = lidar_expected_tensor[:, 0::2] - # the pytorch print loses some precision assert torch.allclose( boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7) - # obtained by the print of the original implementation - expected_tensor = torch.tensor([[[3.2684e+00, 2.5769e-01, -7.7767e-01], - [1.6232e+00, 2.5769e-01, -1.5301e-01], - [1.6232e+00, 1.9169e+00, -1.5301e-01], - [3.2684e+00, 1.9169e+00, -7.7767e-01], - [4.4784e+00, 2.5769e-01, 2.4093e+00], - [2.8332e+00, 2.5769e-01, 3.0340e+00], - [2.8332e+00, 1.9169e+00, 3.0340e+00], - [4.4784e+00, 1.9169e+00, 2.4093e+00]], - [[5.2427e+00, 2.2310e-01, 5.9606e+00], - [3.7324e+00, 2.2310e-01, 6.3029e+00], - [3.7324e+00, 1.8019e+00, 6.3029e+00], - [5.2427e+00, 1.8019e+00, 5.9606e+00], - [6.1340e+00, 2.2310e-01, 9.8933e+00], - [4.6237e+00, 2.2310e-01, 1.0236e+01], - [4.6237e+00, 1.8019e+00, 1.0236e+01], - [6.1340e+00, 1.8019e+00, 9.8933e+00]], - [[7.6525e+00, -2.0648e-02, 2.6364e+01], - [6.2283e+00, -2.0648e-02, 2.6760e+01], - [6.2283e+00, 1.4676e+00, 2.6760e+01], - [7.6525e+00, 1.4676e+00, 2.6364e+01], - [8.2534e+00, -2.0648e-02, 2.8525e+01], - [6.8292e+00, -2.0648e-02, 2.8921e+01], - [6.8292e+00, 1.4676e+00, 2.8921e+01], - [8.2534e+00, 1.4676e+00, 2.8525e+01]], - [[2.8535e+01, 4.9495e-01, 1.8102e+01], - [2.7085e+01, 4.9495e-01, 1.8700e+01], - [2.7085e+01, 1.9028e+00, 1.8700e+01], - [2.8535e+01, 1.9028e+00, 1.8102e+01], - [2.9870e+01, 4.9495e-01, 2.1337e+01], - [2.8420e+01, 4.9495e-01, 2.1935e+01], - [2.8420e+01, 1.9028e+00, 2.1935e+01], - [2.9870e+01, 1.9028e+00, 2.1337e+01]], - [[1.4452e+01, 2.9959e-01, 2.8612e+01], - [1.5474e+01, 2.9959e-01, 3.0032e+01], - [1.5474e+01, 1.7879e+00, 3.0032e+01], - [1.4452e+01, 1.7879e+00, 2.8612e+01], - [1.7530e+01, 2.9959e-01, 2.6398e+01], - [1.8552e+01, 2.9959e-01, 2.7818e+01], - [1.8552e+01, 1.7879e+00, 2.7818e+01], - [1.7530e+01, 1.7879e+00, 2.6398e+01]]]) - - # the pytorch print loses some precision - assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-4, atol=1e-7) + expected_tensor = torch.tensor([[[2.8332e+00, 2.5769e-01, -7.7767e-01], + [1.6232e+00, 2.5769e-01, 2.4093e+00], + [1.6232e+00, 1.9169e+00, 2.4093e+00], + [2.8332e+00, 1.9169e+00, -7.7767e-01], + [4.4784e+00, 2.5769e-01, -1.5302e-01], + [3.2684e+00, 2.5769e-01, 3.0340e+00], + [3.2684e+00, 1.9169e+00, 3.0340e+00], + [4.4784e+00, 1.9169e+00, -1.5302e-01]], + [[4.6237e+00, 2.2310e-01, 5.9606e+00], + [3.7324e+00, 2.2310e-01, 9.8933e+00], + [3.7324e+00, 1.8019e+00, 9.8933e+00], + [4.6237e+00, 1.8019e+00, 5.9606e+00], + [6.1340e+00, 2.2310e-01, 6.3029e+00], + [5.2427e+00, 2.2310e-01, 1.0236e+01], + [5.2427e+00, 1.8019e+00, 1.0236e+01], + [6.1340e+00, 1.8019e+00, 6.3029e+00]], + [[6.8292e+00, -2.0648e-02, 2.6364e+01], + [6.2283e+00, -2.0648e-02, 2.8525e+01], + [6.2283e+00, 1.4676e+00, 2.8525e+01], + [6.8292e+00, 1.4676e+00, 2.6364e+01], + [8.2534e+00, -2.0648e-02, 2.6760e+01], + [7.6525e+00, -2.0648e-02, 2.8921e+01], + [7.6525e+00, 1.4676e+00, 2.8921e+01], + [8.2534e+00, 1.4676e+00, 2.6760e+01]], + [[2.8420e+01, 4.9495e-01, 1.8102e+01], + [2.7085e+01, 4.9495e-01, 2.1337e+01], + [2.7085e+01, 1.9028e+00, 2.1337e+01], + [2.8420e+01, 1.9028e+00, 1.8102e+01], + [2.9870e+01, 4.9495e-01, 1.8700e+01], + [2.8535e+01, 
4.9495e-01, 2.1935e+01], + [2.8535e+01, 1.9028e+00, 2.1935e+01], + [2.9870e+01, 1.9028e+00, 1.8700e+01]], + [[1.4452e+01, 2.9959e-01, 2.7818e+01], + [1.7530e+01, 2.9959e-01, 3.0032e+01], + [1.7530e+01, 1.7879e+00, 3.0032e+01], + [1.4452e+01, 1.7879e+00, 2.7818e+01], + [1.5474e+01, 2.9959e-01, 2.6398e+01], + [1.8552e+01, 2.9959e-01, 2.8612e+01], + [1.8552e+01, 1.7879e+00, 2.8612e+01], + [1.5474e+01, 1.7879e+00, 2.6398e+01]]]) + + assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-3, atol=1e-4) + + th_boxes = torch.tensor( + [[ + 28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002, + 1.48000002, -1.57000005 + ], + [ + 26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002, + 1.39999998, -1.69000006 + ], + [ + 31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998, + 1.48000002, 2.78999996 + ]], + dtype=torch.float32) # test init with a given origin boxes_origin_given = CameraInstance3DBoxes( @@ -948,17 +1130,17 @@ def test_boxes3d_overlaps(): # Test LiDAR boxes 3D overlaps boxes1_tensor = torch.tensor( - [[1.8, -2.5, -1.8, 1.75, 3.39, 1.65, 1.6615927], - [8.9, -2.5, -1.6, 1.54, 4.01, 1.57, 1.5215927], - [28.3, 0.5, -1.3, 1.47, 2.23, 1.48, 4.7115927], - [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, 0.35]], + [[1.8, -2.5, -1.8, 1.75, 3.39, 1.65, -1.6615927], + [8.9, -2.5, -1.6, 1.54, 4.01, 1.57, -1.5215927], + [28.3, 0.5, -1.3, 1.47, 2.23, 1.48, -4.7115927], + [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, -0.35]], device='cuda') boxes1 = LiDARInstance3DBoxes(boxes1_tensor) - boxes2_tensor = torch.tensor([[1.2, -3.0, -1.9, 1.8, 3.4, 1.7, 1.9], - [8.1, -2.9, -1.8, 1.5, 4.1, 1.6, 1.8], - [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, 0.35], - [20.1, -28.5, -1.9, 1.6, 3.5, 1.4, 5.1]], + boxes2_tensor = torch.tensor([[1.2, -3.0, -1.9, 1.8, 3.4, 1.7, -1.9], + [8.1, -2.9, -1.8, 1.5, 4.1, 1.6, -1.8], + [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, -0.35], + [20.1, -28.5, -1.9, 1.6, 3.5, 1.4, -5.1]], device='cuda') boxes2 = LiDARInstance3DBoxes(boxes2_tensor) @@ -1101,6 +1283,7 @@ def test_depth_boxes3d(): [-2.4016, -3.2521, 0.4426, 0.8234, 0.5325, 1.0099, -0.1215], [-2.5181, -2.5298, -0.4321, 0.8597, 0.6193, 1.0204, -0.0493], [-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585]]) + expected_tensor[:, -1:] -= 0.022998953275003075 * 2 points, rot_mat_T = boxes_rot.rotate(-0.022998953275003075, points) expected_points = torch.tensor([[-0.7049, -1.2400, -1.4658, 2.5359], [-0.9881, -4.7599, -1.3857, 0.7167], @@ -1115,10 +1298,13 @@ def test_depth_boxes3d(): assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3) # with input torch.Tensor points and rotation matrix - points, rot_mat_T = boxes.rotate(0.022998953275003075, points) # back + points, rot_mat_T = boxes.rotate(-0.022998953275003075, points) # back rot_mat = np.array([[0.99973554, 0.02299693, 0.], [-0.02299693, 0.99973554, 0.], [0., 0., 1.]]) points, rot_mat_T = boxes.rotate(rot_mat, points) + expected_rot_mat_T = torch.tensor([[0.99973554, 0.02299693, 0.0000], + [-0.02299693, 0.99973554, 0.0000], + [0.0000, 0.0000, 1.0000]]) assert torch.allclose(boxes_rot.tensor, expected_tensor, 1e-3) assert torch.allclose(points, expected_points, 1e-3) assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3) @@ -1135,27 +1321,64 @@ def test_depth_boxes3d(): [-0.0974, 6.7093, -0.9697, 0.5599], [0.5669, 0.6404, -0.5265, 1.0032], [-0.4302, 4.5981, -1.4741, 0.0556]]) - expected_rot_mat_T_np = np.array([[0.9997, -0.0230, 0.0000], - [0.0230, 0.9997, 0.0000], + expected_rot_mat_T_np = np.array([[0.99973554, -0.02299693, 0.0000], + [0.02299693, 0.99973554, 
0.0000], [0.0000, 0.0000, 1.0000]]) expected_tensor = torch.tensor( [[-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585], [-2.4016, -3.2521, 0.4426, 0.8234, 0.5325, 1.0099, -0.1215], [-2.5181, -2.5298, -0.4321, 0.8597, 0.6193, 1.0204, -0.0493], [-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585]]) + expected_tensor[:, -1:] -= 0.022998953275003075 * 2 assert torch.allclose(boxes.tensor, expected_tensor, 1e-3) assert np.allclose(points_np, expected_points_np, 1e-3) assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3) # with input DepthPoints and rotation matrix - points_np, rot_mat_T_np = boxes.rotate(0.022998953275003075, points_np) + points_np, rot_mat_T_np = boxes.rotate(-0.022998953275003075, points_np) depth_points = DepthPoints(points_np, points_dim=4) depth_points, rot_mat_T_np = boxes.rotate(rot_mat, depth_points) points_np = depth_points.tensor.numpy() + expected_rot_mat_T_np = expected_rot_mat_T_np.T assert torch.allclose(boxes.tensor, expected_tensor, 1e-3) assert np.allclose(points_np, expected_points_np, 1e-3) assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3) + expected_tensor = torch.tensor([[[-2.1217, -3.5105, -0.5570], + [-2.1217, -3.5105, 0.3384], + [-1.8985, -1.3818, 0.3384], + [-1.8985, -1.3818, -0.5570], + [-1.1883, -3.6084, -0.5570], + [-1.1883, -3.6084, 0.3384], + [-0.9651, -1.4796, 0.3384], + [-0.9651, -1.4796, -0.5570]], + [[-2.8519, -3.4460, 0.4426], + [-2.8519, -3.4460, 1.4525], + [-2.7632, -2.9210, 1.4525], + [-2.7632, -2.9210, 0.4426], + [-2.0401, -3.5833, 0.4426], + [-2.0401, -3.5833, 1.4525], + [-1.9513, -3.0582, 1.4525], + [-1.9513, -3.0582, 0.4426]], + [[-2.9755, -2.7971, -0.4321], + [-2.9755, -2.7971, 0.5883], + [-2.9166, -2.1806, 0.5883], + [-2.9166, -2.1806, -0.4321], + [-2.1197, -2.8789, -0.4321], + [-2.1197, -2.8789, 0.5883], + [-2.0608, -2.2624, 0.5883], + [-2.0608, -2.2624, -0.4321]], + [[-2.1217, -3.5105, -0.5570], + [-2.1217, -3.5105, 0.3384], + [-1.8985, -1.3818, 0.3384], + [-1.8985, -1.3818, -0.5570], + [-1.1883, -3.6084, -0.5570], + [-1.1883, -3.6084, 0.3384], + [-0.9651, -1.4796, 0.3384], + [-0.9651, -1.4796, -0.5570]]]) + + assert torch.allclose(boxes.corners, expected_tensor, 1e-3) + th_boxes = torch.tensor( [[0.61211395, 0.8129094, 0.10563634, 1.497534, 0.16927195, 0.27956772], [1.430009, 0.49797538, 0.9382923, 0.07694054, 0.9312509, 1.8919173]], @@ -1182,6 +1405,11 @@ def test_depth_boxes3d(): mask = boxes.nonempty() assert (mask == expected_tensor).all() + # test bbox in_range + expected_tensor = torch.tensor([0, 1], dtype=torch.bool) + mask = boxes.in_range_3d([1, 0, -2, 2, 1, 5]) + assert (mask == expected_tensor).all() + expected_tensor = torch.tensor([[[-0.1030, 0.6649, 0.1056], [-0.1030, 0.6649, 0.3852], [-0.1030, 0.9029, 0.3852], @@ -1198,11 +1426,11 @@ def test_depth_boxes3d(): [1.5112, -0.0352, 2.8302], [1.5112, 0.8986, 2.8302], [1.5112, 0.8986, 0.9383]]]) - torch.allclose(boxes.corners, expected_tensor) + assert torch.allclose(boxes.corners, expected_tensor, 1e-3) # test points in boxes if torch.cuda.is_available(): - box_idxs_of_pts = boxes.points_in_boxes(points.cuda()) + box_idxs_of_pts = boxes.points_in_boxes_all(points.cuda()) expected_idxs_of_pts = torch.tensor( [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], device='cuda:0', @@ -1211,8 +1439,8 @@ def test_depth_boxes3d(): # test get_surface_line_center boxes = torch.tensor( - [[0.3294, 1.0359, 0.1171, 1.0822, 1.1247, 1.3721, 0.4916], - [-2.4630, -2.6324, -0.1616, 0.9202, 1.7896, 0.1992, 0.3185]]) + [[0.3294, 1.0359, 0.1171, 1.0822, 1.1247, 
1.3721, -0.4916], + [-2.4630, -2.6324, -0.1616, 0.9202, 1.7896, 0.1992, -0.3185]]) boxes = DepthInstance3DBoxes( boxes, box_dim=boxes.shape[-1], with_yaw=True, origin=(0.5, 0.5, 0.5)) surface_center, line_center = boxes.get_surface_line_center() @@ -1260,6 +1488,7 @@ def test_depth_boxes3d(): def test_rotation_3d_in_axis(): + # clockwise points = torch.tensor([[[-0.4599, -0.0471, 0.0000], [-0.4599, -0.0471, 1.8433], [-0.4599, 0.0471, 1.8433]], @@ -1267,15 +1496,115 @@ def test_rotation_3d_in_axis(): [-0.2555, -0.2683, 0.9072], [-0.2555, 0.2683, 0.9072]]]) rotated = rotation_3d_in_axis( - points, torch.tensor([-np.pi / 10, np.pi / 10]), axis=0) - expected_rotated = torch.tensor([[[0.0000, -0.4228, -0.1869], - [1.8433, -0.4228, -0.1869], - [1.8433, -0.4519, -0.0973]], - [[0.0000, -0.3259, -0.1762], - [0.9072, -0.3259, -0.1762], - [0.9072, -0.1601, 0.3341]]]) + points, + torch.tensor([-np.pi / 10, np.pi / 10]), + axis=0, + clockwise=True) + expected_rotated = torch.tensor( + [[[-0.4599, -0.0448, -0.0146], [-0.4599, -0.6144, 1.7385], + [-0.4599, -0.5248, 1.7676]], + [[-0.2555, -0.2552, 0.0829], [-0.2555, 0.0252, 0.9457], + [-0.2555, 0.5355, 0.7799]]], + dtype=torch.float32) + assert torch.allclose(rotated, expected_rotated, atol=1e-3) + + # anti-clockwise with return rotation mat + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, -0.0471, 1.8433]]]) + rotated = rotation_3d_in_axis(points, torch.tensor([np.pi / 2]), axis=0) + expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471], + [-0.4599, -1.8433, -0.0471]]]) assert torch.allclose(rotated, expected_rotated, 1e-3) + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, -0.0471, 1.8433]]]) + rotated, mat = rotation_3d_in_axis( + points, torch.tensor([np.pi / 2]), axis=0, return_mat=True) + expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471], + [-0.4599, -1.8433, -0.0471]]]) + expected_mat = torch.tensor([[[1, 0, 0], [0, 0, 1], [0, -1, 0]]]).float() + assert torch.allclose(rotated, expected_rotated, atol=1e-6) + assert torch.allclose(mat, expected_mat, atol=1e-6) + + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, -0.0471, 1.8433]], + [[-0.2555, -0.2683, 0.0000], + [-0.2555, -0.2683, 0.9072]]]) + rotated = rotation_3d_in_axis(points, np.pi / 2, axis=0) + expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471], + [-0.4599, -1.8433, -0.0471]], + [[-0.2555, 0.0000, -0.2683], + [-0.2555, -0.9072, -0.2683]]]) + assert torch.allclose(rotated, expected_rotated, atol=1e-3) + + points = np.array([[[-0.4599, -0.0471, 0.0000], [-0.4599, -0.0471, + 1.8433]], + [[-0.2555, -0.2683, 0.0000], + [-0.2555, -0.2683, 0.9072]]]).astype(np.float32) + + rotated = rotation_3d_in_axis(points, np.pi / 2, axis=0) + expected_rotated = np.array([[[-0.4599, 0.0000, -0.0471], + [-0.4599, -1.8433, -0.0471]], + [[-0.2555, 0.0000, -0.2683], + [-0.2555, -0.9072, -0.2683]]]) + assert np.allclose(rotated, expected_rotated, atol=1e-3) + + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, -0.0471, 1.8433]], + [[-0.2555, -0.2683, 0.0000], + [-0.2555, -0.2683, 0.9072]]]) + angles = [np.pi / 2, -np.pi / 2] + rotated = rotation_3d_in_axis(points, angles, axis=0).numpy() + expected_rotated = np.array([[[-0.4599, 0.0000, -0.0471], + [-0.4599, -1.8433, -0.0471]], + [[-0.2555, 0.0000, 0.2683], + [-0.2555, 0.9072, 0.2683]]]) + assert np.allclose(rotated, expected_rotated, atol=1e-3) + + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, -0.0471, 1.8433]], + [[-0.2555, -0.2683, 0.0000], + [-0.2555, 
-0.2683, 0.9072]]]) + angles = [np.pi / 2, -np.pi / 2] + rotated = rotation_3d_in_axis(points, angles, axis=1).numpy() + expected_rotated = np.array([[[0.0000, -0.0471, 0.4599], + [1.8433, -0.0471, 0.4599]], + [[0.0000, -0.2683, -0.2555], + [-0.9072, -0.2683, -0.2555]]]) + assert np.allclose(rotated, expected_rotated, atol=1e-3) + + points = torch.tensor([[[-0.4599, -0.0471, 0.0000], + [-0.4599, 0.0471, 1.8433]], + [[-0.2555, -0.2683, 0.0000], + [0.2555, -0.2683, 0.9072]]]) + angles = [np.pi / 2, -np.pi / 2] + rotated = rotation_3d_in_axis(points, angles, axis=2).numpy() + expected_rotated = np.array([[[0.0471, -0.4599, 0.0000], + [-0.0471, -0.4599, 1.8433]], + [[-0.2683, 0.2555, 0.0000], + [-0.2683, -0.2555, 0.9072]]]) + assert np.allclose(rotated, expected_rotated, atol=1e-3) + + points = torch.tensor([[[-0.0471, 0.0000], [-0.0471, 1.8433]], + [[-0.2683, 0.0000], [-0.2683, 0.9072]]]) + angles = [np.pi / 2, -np.pi / 2] + rotated = rotation_3d_in_axis(points, angles) + expected_rotated = np.array([[[0.0000, -0.0471], [-1.8433, -0.0471]], + [[0.0000, 0.2683], [0.9072, 0.2683]]]) + assert np.allclose(rotated, expected_rotated, atol=1e-3) + + +def test_rotation_2d(): + angles = np.array([3.14]) + corners = np.array([[[-0.235, -0.49], [-0.235, 0.49], [0.235, 0.49], + [0.235, -0.49]]]) + corners_rotated = rotation_3d_in_axis(corners, angles) + expected_corners = np.array([[[0.2357801, 0.48962511], + [0.2342193, -0.49037365], + [-0.2357801, -0.48962511], + [-0.2342193, 0.49037365]]]) + assert np.allclose(corners_rotated, expected_corners) + def test_limit_period(): torch.manual_seed(0) @@ -1285,6 +1614,11 @@ def test_limit_period(): [0.3074]]) assert torch.allclose(result, expected_result, 1e-3) + val = val.numpy() + result = limit_period(val) + expected_result = expected_result.numpy() + assert np.allclose(result, expected_result, 1e-3) + def test_xywhr2xyxyr(): torch.manual_seed(0) @@ -1324,3 +1658,139 @@ def test_points_cam2img(): [0.6994, 0.7782], [0.5623, 0.6303], [0.4359, 0.6532]]) assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3) + + points = points.numpy() + proj_mat = proj_mat.numpy() + point_2d_res = points_cam2img(points, proj_mat) + expected_point_2d_res = expected_point_2d_res.numpy() + assert np.allclose(point_2d_res, expected_point_2d_res, 1e-3) + + points = torch.from_numpy(points) + point_2d_res = points_cam2img(points, proj_mat) + expected_point_2d_res = torch.from_numpy(expected_point_2d_res) + assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3) + + point_2d_res = points_cam2img(points, proj_mat, with_depth=True) + expected_point_2d_res = torch.tensor([[0.5832, 0.6496, 1.7577], + [0.6146, 0.7910, 1.5477], + [0.6994, 0.7782, 2.0091], + [0.5623, 0.6303, 1.8739], + [0.4359, 0.6532, 1.2056]]) + assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3) + + +def test_points_in_boxes(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + lidar_pts = torch.tensor([[1.0, 4.3, 0.1], [1.0, 4.4, + 0.1], [1.1, 4.3, 0.1], + [0.9, 4.3, 0.1], [1.0, -0.3, 0.1], + [1.0, -0.4, 0.1], [2.9, 0.1, 6.0], + [-0.9, 3.9, 6.0]]).cuda() + lidar_boxes = torch.tensor([[1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]], + dtype=torch.float32).cuda() + lidar_boxes = LiDARInstance3DBoxes(lidar_boxes) + + point_indices = lidar_boxes.points_in_boxes_all(lidar_pts) + expected_point_indices = torch.tensor( + [[1, 
0, 1, 1], [0, 0, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1], [1, 0, 1, 1], + [0, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([8, 4]) + assert (point_indices == expected_point_indices).all() + + lidar_pts = torch.tensor([[1.0, 4.3, 0.1], [1.0, 4.4, + 0.1], [1.1, 4.3, 0.1], + [0.9, 4.3, 0.1], [1.0, -0.3, 0.1], + [1.0, -0.4, 0.1], [2.9, 0.1, 6.0], + [-0.9, 3.9, 6.0]]).cuda() + lidar_boxes = torch.tensor([[1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]], + dtype=torch.float32).cuda() + lidar_boxes = LiDARInstance3DBoxes(lidar_boxes) + + point_indices = lidar_boxes.points_in_boxes_part(lidar_pts) + expected_point_indices = torch.tensor([0, -1, 0, 3, 0, -1, 1, 1], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([8]) + assert (point_indices == expected_point_indices).all() + + depth_boxes = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], + [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]], + dtype=torch.float32).cuda() + depth_boxes = DepthInstance3DBoxes(depth_boxes) + depth_pts = torch.tensor( + [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6], + [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3], + [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [ + -16, -18, 9 + ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]], + dtype=torch.float32).cuda() + + point_indices = depth_boxes.points_in_boxes_all(depth_pts) + expected_point_indices = torch.tensor( + [[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0], + [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([15, 2]) + assert (point_indices == expected_point_indices).all() + + point_indices = depth_boxes.points_in_boxes_part(depth_pts) + expected_point_indices = torch.tensor( + [0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([15]) + assert (point_indices == expected_point_indices).all() + + depth_boxes = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3], + [-10.0, 23.0, 16.0, 10, 20, 20, 0.5], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6], + [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]], + dtype=torch.float32).cuda() + cam_boxes = DepthInstance3DBoxes(depth_boxes).convert_to(Box3DMode.CAM) + depth_pts = torch.tensor( + [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6], + [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3], + [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], + [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4], [1.0, 4.3, 0.1], + [1.0, 4.4, 0.1], [1.1, 4.3, 0.1], [0.9, 4.3, 0.1], [1.0, -0.3, 0.1], + [1.0, -0.4, 0.1], [2.9, 0.1, 6.0], [-0.9, 3.9, 6.0]], + dtype=torch.float32).cuda() + + cam_pts = DepthPoints(depth_pts).convert_to(Coord3DMode.CAM).tensor + + point_indices = cam_boxes.points_in_boxes_all(cam_pts) + expected_point_indices = torch.tensor( + [[1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1], + [1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1], [0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], + [0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 
0, 1], + [0, 0, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 0, 0], + [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([23, 6]) + assert (point_indices == expected_point_indices).all() + + point_indices = cam_boxes.points_in_boxes_batch(cam_pts) + assert (point_indices == expected_point_indices).all() + + point_indices = cam_boxes.points_in_boxes_part(cam_pts) + expected_point_indices = torch.tensor([ + 0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, 3, -1, -1, 2, 3, 3, 2, 2, 3, + 0, 0 + ], + dtype=torch.int32).cuda() + assert point_indices.shape == torch.Size([23]) + assert (point_indices == expected_point_indices).all() + + point_indices = cam_boxes.points_in_boxes(cam_pts) + assert (point_indices == expected_point_indices).all() diff --git a/tests/test_utils/test_box_np_ops.py b/tests/test_utils/test_box_np_ops.py index 77924b1c03..1c6275de52 100644 --- a/tests/test_utils/test_box_np_ops.py +++ b/tests/test_utils/test_box_np_ops.py @@ -20,7 +20,7 @@ def test_camera_to_lidar(): def test_box_camera_to_lidar(): from mmdet3d.core.bbox.box_np_ops import box_camera_to_lidar - box = np.array([[1.84, 1.47, 8.41, 1.2, 1.89, 0.48, 0.01]]) + box = np.array([[1.84, 1.47, 8.41, 1.2, 1.89, 0.48, -0.01]]) rect = np.array([[0.9999128, 0.01009263, -0.00851193, 0.], [-0.01012729, 0.9999406, -0.00403767, 0.], [0.00847068, 0.00412352, 0.9999556, 0.], [0., 0., 0., @@ -30,8 +30,9 @@ def test_box_camera_to_lidar(): [0.9999753, 0.00693114, -0.0011439, -0.3321029], [0., 0., 0., 1.]]) box_lidar = box_camera_to_lidar(box, rect, Trv2c) - expected_box = np.array( - [[8.73138192, -1.85591746, -1.59969933, 0.48, 1.2, 1.89, 0.01]]) + expected_box = np.array([[ + 8.73138192, -1.85591746, -1.59969933, 1.2, 0.48, 1.89, 0.01 - np.pi / 2 + ]]) assert np.allclose(box_lidar, expected_box) @@ -48,22 +49,35 @@ def test_center_to_corner_box2d(): from mmdet3d.core.bbox.box_np_ops import center_to_corner_box2d center = np.array([[9.348705, -3.6271024]]) dims = np.array([[0.47, 0.98]]) - angles = np.array([-3.14]) + angles = np.array([3.14]) corner = center_to_corner_box2d(center, dims, angles) expected_corner = np.array([[[9.584485, -3.1374772], [9.582925, -4.117476], [9.112926, -4.1167274], [9.114486, -3.1367288]]]) assert np.allclose(corner, expected_corner) + center = np.array([[-0.0, 0.0]]) + dims = np.array([[4.0, 8.0]]) + angles = np.array([-0.785398]) # -45 degrees + corner = center_to_corner_box2d(center, dims, angles) + expected_corner = np.array([[[-4.24264, -1.41421], [1.41421, 4.24264], + [4.24264, 1.41421], [-1.41421, -4.24264]]]) + assert np.allclose(corner, expected_corner) + + +def test_points_in_convex_polygon_jit(): + from mmdet3d.core.bbox.box_np_ops import points_in_convex_polygon_jit + points = np.array([[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]]) + polygons = np.array([[[1.0, 0.0], [0.0, 1.0], [0.0, 0.5], [0.0, 0.0]], + [[1.0, 0.0], [1.0, 1.0], [0.5, 1.0], [0.0, 1.0]], + [[1.0, 0.0], [0.0, 1.0], [-1.0, 0.0], [0.0, -1.0]]]) + res = points_in_convex_polygon_jit(points, polygons) + expected_res = np.array([[1, 0, 1], [0, 0, 0], [0, 1, 0]]).astype(np.bool) + assert np.allclose(res, expected_res) -def test_rotation_2d(): - from mmdet3d.core.bbox.box_np_ops import rotation_2d - angles = np.array([-3.14]) - corners = np.array([[[-0.235, -0.49], [-0.235, 0.49], [0.235, 0.49], - [0.235, -0.49]]]) - corners_rotated = rotation_2d(corners, angles) - expected_corners = np.array([[[0.2357801, 0.48962511], - [0.2342193, -0.49037365], - [-0.2357801, -0.48962511], - 
[-0.2342193, 0.49037365]]]) - assert np.allclose(corners_rotated, expected_corners) + polygons = np.array([[[0.0, 0.0], [0.0, 1.0], [0.5, 0.5], [1.0, 0.0]], + [[0.0, 1.0], [1.0, 1.0], [1.0, 0.5], [1.0, 0.0]], + [[1.0, 0.0], [0.0, -1.0], [-1.0, 0.0], [0.0, 1.1]]]) + res = points_in_convex_polygon_jit(points, polygons, clockwise=True) + expected_res = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 0]]).astype(np.bool) + assert np.allclose(res, expected_res) diff --git a/tests/test_utils/test_coord_3d_mode.py b/tests/test_utils/test_coord_3d_mode.py index 68703431d2..24f0e192c8 100644 --- a/tests/test_utils/test_coord_3d_mode.py +++ b/tests/test_utils/test_coord_3d_mode.py @@ -3,7 +3,8 @@ import torch from mmdet3d.core.bbox import (CameraInstance3DBoxes, Coord3DMode, - DepthInstance3DBoxes, LiDARInstance3DBoxes) + DepthInstance3DBoxes, LiDARInstance3DBoxes, + limit_period) from mmdet3d.core.points import CameraPoints, DepthPoints, LiDARPoints @@ -242,22 +243,31 @@ def test_boxes_conversion(): convert_lidar_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM, Coord3DMode.LIDAR) - expected_tensor = torch.tensor( - [[-1.7501, -1.7802, -2.5162, 1.6500, 1.7500, 3.3900, 1.4800], - [-1.6357, -8.9594, -2.4567, 1.5700, 1.5400, 4.0100, 1.6200], - [-1.3033, -28.2967, 0.5558, 1.4800, 1.4700, 2.2300, -1.5700], - [-1.7361, -26.6690, -21.8230, 1.4000, 1.5600, 3.4800, -1.6900], - [-1.6218, -31.3198, -8.1621, 1.4800, 1.7400, 3.7700, 2.7900]]) + expected_tensor = torch.tensor([[ + -1.7501, -1.7802, -2.5162, 1.7500, 1.6500, 3.3900, -1.4800 - np.pi / 2 + ], [ + -1.6357, -8.9594, -2.4567, 1.5400, 1.5700, 4.0100, -1.6200 - np.pi / 2 + ], [-1.3033, -28.2967, 0.5558, 1.4700, 1.4800, 2.2300, 1.5700 - np.pi / 2], + [ + -1.7361, -26.6690, -21.8230, 1.5600, + 1.4000, 3.4800, 1.6900 - np.pi / 2 + ], + [ + -1.6218, -31.3198, -8.1621, 1.7400, + 1.4800, 3.7700, -2.7900 - np.pi / 2 + ]]) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3) convert_depth_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM, Coord3DMode.DEPTH) expected_tensor = torch.tensor( - [[1.7802, 1.7501, 2.5162, 1.7500, 1.6500, 3.3900, 1.4800], - [8.9594, 1.6357, 2.4567, 1.5400, 1.5700, 4.0100, 1.6200], - [28.2967, 1.3033, -0.5558, 1.4700, 1.4800, 2.2300, -1.5700], - [26.6690, 1.7361, 21.8230, 1.5600, 1.4000, 3.4800, -1.6900], - [31.3198, 1.6218, 8.1621, 1.7400, 1.4800, 3.7700, 2.7900]]) + [[1.7802, -1.7501, -2.5162, 1.7500, 1.6500, 3.3900, -1.4800], + [8.9594, -1.6357, -2.4567, 1.5400, 1.5700, 4.0100, -1.6200], + [28.2967, -1.3033, 0.5558, 1.4700, 1.4800, 2.2300, 1.5700], + [26.6690, -1.7361, -21.8230, 1.5600, 1.4000, 3.4800, 1.6900], + [31.3198, -1.6218, -8.1621, 1.7400, 1.4800, 3.7700, -2.7900]]) assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3) # test LIDAR to CAM and DEPTH @@ -269,22 +279,42 @@ def test_boxes_conversion(): [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]]) convert_cam_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR, Coord3DMode.CAM) - expected_tensor = torch.tensor( - [[-2.5162, 1.7501, 1.7802, 3.3900, 1.6500, 1.7500, 1.4800], - [-2.4567, 1.6357, 8.9594, 4.0100, 1.5700, 1.5400, 1.6200], - [0.5558, 1.3033, 28.2967, 2.2300, 1.4800, 1.4700, -1.5700], - [-21.8230, 1.7361, 26.6690, 3.4800, 1.4000, 1.5600, -1.6900], - [-8.1621, 1.6218, 31.3198, 3.7700, 1.4800, 1.7400, 2.7900]]) + expected_tensor = torch.tensor([ + [-2.5162, 1.7501, 1.7802, 1.7500, 1.6500, 3.3900, -1.4800 - np.pi / 2], + 
[-2.4567, 1.6357, 8.9594, 1.5400, 1.5700, 4.0100, -1.6200 - np.pi / 2], + [0.5558, 1.3033, 28.2967, 1.4700, 1.4800, 2.2300, 1.5700 - np.pi / 2], + [ + -21.8230, 1.7361, 26.6690, 1.5600, 1.4000, 3.4800, + 1.6900 - np.pi / 2 + ], + [ + -8.1621, 1.6218, 31.3198, 1.7400, 1.4800, 3.7700, + -2.7900 - np.pi / 2 + ] + ]) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3) convert_depth_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR, Coord3DMode.DEPTH) - expected_tensor = torch.tensor( - [[-2.5162, 1.7802, -1.7501, 3.3900, 1.7500, 1.6500, 1.4800], - [-2.4567, 8.9594, -1.6357, 4.0100, 1.5400, 1.5700, 1.6200], - [0.5558, 28.2967, -1.3033, 2.2300, 1.4700, 1.4800, -1.5700], - [-21.8230, 26.6690, -1.7361, 3.4800, 1.5600, 1.4000, -1.6900], - [-8.1621, 31.3198, -1.6218, 3.7700, 1.7400, 1.4800, 2.7900]]) + expected_tensor = torch.tensor([[ + -2.5162, 1.7802, -1.7501, 1.7500, 3.3900, 1.6500, 1.4800 + np.pi / 2 + ], [-2.4567, 8.9594, -1.6357, 1.5400, 4.0100, 1.5700, 1.6200 + np.pi / 2], + [ + 0.5558, 28.2967, -1.3033, 1.4700, + 2.2300, 1.4800, -1.5700 + np.pi / 2 + ], + [ + -21.8230, 26.6690, -1.7361, 1.5600, + 3.4800, 1.4000, -1.6900 + np.pi / 2 + ], + [ + -8.1621, 31.3198, -1.6218, 1.7400, + 3.7700, 1.4800, 2.7900 + np.pi / 2 + ]]) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3) # test DEPTH to CAM and LIDAR @@ -297,19 +327,25 @@ def test_boxes_conversion(): convert_cam_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH, Coord3DMode.CAM) expected_tensor = torch.tensor( - [[1.7802, -1.7501, -2.5162, 1.7500, 1.6500, 3.3900, 1.4800], - [8.9594, -1.6357, -2.4567, 1.5400, 1.5700, 4.0100, 1.6200], - [28.2967, -1.3033, 0.5558, 1.4700, 1.4800, 2.2300, -1.5700], - [26.6690, -1.7361, -21.8230, 1.5600, 1.4000, 3.4800, -1.6900], - [31.3198, -1.6218, -8.1621, 1.7400, 1.4800, 3.7700, 2.7900]]) + [[1.7802, 1.7501, 2.5162, 1.7500, 1.6500, 3.3900, -1.4800], + [8.9594, 1.6357, 2.4567, 1.5400, 1.5700, 4.0100, -1.6200], + [28.2967, 1.3033, -0.5558, 1.4700, 1.4800, 2.2300, 1.5700], + [26.6690, 1.7361, 21.8230, 1.5600, 1.4000, 3.4800, 1.6900], + [31.3198, 1.6218, 8.1621, 1.7400, 1.4800, 3.7700, -2.7900]]) assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3) convert_lidar_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH, Coord3DMode.LIDAR) - expected_tensor = torch.tensor( - [[2.5162, -1.7802, -1.7501, 3.3900, 1.7500, 1.6500, 1.4800], - [2.4567, -8.9594, -1.6357, 4.0100, 1.5400, 1.5700, 1.6200], - [-0.5558, -28.2967, -1.3033, 2.2300, 1.4700, 1.4800, -1.5700], - [21.8230, -26.6690, -1.7361, 3.4800, 1.5600, 1.4000, -1.6900], - [8.1621, -31.3198, -1.6218, 3.7700, 1.7400, 1.4800, 2.7900]]) + expected_tensor = torch.tensor([[ + 2.5162, -1.7802, -1.7501, 1.7500, 3.3900, 1.6500, 1.4800 - np.pi / 2 + ], [ + 2.4567, -8.9594, -1.6357, 1.5400, 4.0100, 1.5700, 1.6200 - np.pi / 2 + ], [ + -0.5558, -28.2967, -1.3033, 1.4700, 2.2300, 1.4800, -1.5700 - np.pi / 2 + ], [ + 21.8230, -26.6690, -1.7361, 1.5600, 3.4800, 1.4000, -1.6900 - np.pi / 2 + ], [8.1621, -31.3198, -1.6218, 1.7400, 3.7700, 1.4800, + 2.7900 - np.pi / 2]]) + expected_tensor[:, -1:] = limit_period( + expected_tensor[:, -1:], period=np.pi * 2) assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3) diff --git a/tests/test_utils/test_points.py b/tests/test_utils/test_points.py index 5376ffe8bd..20af27fc24 
100644 --- a/tests/test_utils/test_points.py +++ b/tests/test_utils/test_points.py @@ -66,6 +66,7 @@ def test_base_points(): ]]) assert torch.allclose(expected_tensor, base_points.tensor) + assert torch.allclose(expected_tensor[:, :2], base_points.bev) assert torch.allclose(expected_tensor[:, :3], base_points.coord) assert torch.allclose(expected_tensor[:, 3:6], base_points.color) assert torch.allclose(expected_tensor[:, 6], base_points.height) @@ -327,6 +328,7 @@ def test_cam_points(): ]]) assert torch.allclose(expected_tensor, cam_points.tensor) + assert torch.allclose(expected_tensor[:, [0, 2]], cam_points.bev) assert torch.allclose(expected_tensor[:, :3], cam_points.coord) assert torch.allclose(expected_tensor[:, 3:6], cam_points.color) assert torch.allclose(expected_tensor[:, 6], cam_points.height) @@ -603,6 +605,7 @@ def test_lidar_points(): ]]) assert torch.allclose(expected_tensor, lidar_points.tensor) + assert torch.allclose(expected_tensor[:, :2], lidar_points.bev) assert torch.allclose(expected_tensor[:, :3], lidar_points.coord) assert torch.allclose(expected_tensor[:, 3:6], lidar_points.color) assert torch.allclose(expected_tensor[:, 6], lidar_points.height) @@ -879,6 +882,7 @@ def test_depth_points(): ]]) assert torch.allclose(expected_tensor, depth_points.tensor) + assert torch.allclose(expected_tensor[:, :2], depth_points.bev) assert torch.allclose(expected_tensor[:, :3], depth_points.coord) assert torch.allclose(expected_tensor[:, 3:6], depth_points.color) assert torch.allclose(expected_tensor[:, 6], depth_points.height) diff --git a/tests/test_utils/test_setup_env.py b/tests/test_utils/test_setup_env.py new file mode 100644 index 0000000000..0c070c9f0e --- /dev/null +++ b/tests/test_utils/test_setup_env.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import multiprocessing as mp +import os +import platform + +import cv2 +from mmcv import Config + +from mmdet3d.utils import setup_multi_processes + + +def test_setup_multi_processes(): + # temp save system setting + sys_start_method = mp.get_start_method(allow_none=True) + sys_cv_threads = cv2.getNumThreads() + # pop and temp save system env vars + sys_omp_threads = os.environ.pop('OMP_NUM_THREADS', default=None) + sys_mkl_threads = os.environ.pop('MKL_NUM_THREADS', default=None) + + # test config without setting env + config = dict(data=dict(workers_per_gpu=2)) + cfg = Config(config) + setup_multi_processes(cfg) + assert os.getenv('OMP_NUM_THREADS') == '1' + assert os.getenv('MKL_NUM_THREADS') == '1' + # when set to 0, the num threads will be 1 + assert cv2.getNumThreads() == 1 + if platform.system() != 'Windows': + assert mp.get_start_method() == 'fork' + + # test num workers <= 1 + os.environ.pop('OMP_NUM_THREADS') + os.environ.pop('MKL_NUM_THREADS') + config = dict(data=dict(workers_per_gpu=0)) + cfg = Config(config) + setup_multi_processes(cfg) + assert 'OMP_NUM_THREADS' not in os.environ + assert 'MKL_NUM_THREADS' not in os.environ + + # test manually set env var + os.environ['OMP_NUM_THREADS'] = '4' + config = dict(data=dict(workers_per_gpu=2)) + cfg = Config(config) + setup_multi_processes(cfg) + assert os.getenv('OMP_NUM_THREADS') == '4' + + # test manually set opencv threads and mp start method + config = dict( + data=dict(workers_per_gpu=2), + opencv_num_threads=4, + mp_start_method='spawn') + cfg = Config(config) + setup_multi_processes(cfg) + assert cv2.getNumThreads() == 4 + assert mp.get_start_method() == 'spawn' + + # revert setting to avoid affecting other programs + if sys_start_method: + mp.set_start_method(sys_start_method, force=True) + cv2.setNumThreads(sys_cv_threads) + if sys_omp_threads: + os.environ['OMP_NUM_THREADS'] = sys_omp_threads + else: + os.environ.pop('OMP_NUM_THREADS') + if sys_mkl_threads: + os.environ['MKL_NUM_THREADS'] = sys_mkl_threads + else: + os.environ.pop('MKL_NUM_THREADS') diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 2283efb329..c68e43a36d 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,7 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np +import pytest import torch -from mmdet3d.core import draw_heatmap_gaussian +from mmdet3d.core import array_converter, draw_heatmap_gaussian, points_img2cam +from mmdet3d.core.bbox import CameraInstance3DBoxes +from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices, + get_keypoints, handle_proj_objs) def test_gaussian(): @@ -10,3 +15,274 @@ def test_gaussian(): radius = 2 draw_heatmap_gaussian(heatmap, ct_int, radius) assert torch.isclose(torch.sum(heatmap), torch.tensor(4.3505), atol=1e-3) + + +def test_array_converter(): + # to torch + @array_converter(to_torch=True, apply_to=('array_a', 'array_b')) + def test_func_1(array_a, array_b, container): + container.append(array_a) + container.append(array_b) + return array_a.clone(), array_b.clone() + + np_array_a = np.array([0.0]) + np_array_b = np.array([0.0]) + container = [] + new_array_a, new_array_b = test_func_1(np_array_a, np_array_b, container) + + assert isinstance(new_array_a, np.ndarray) + assert isinstance(new_array_b, np.ndarray) + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + + # one to torch and one not + @array_converter(to_torch=True, apply_to=('array_a', )) + def test_func_2(array_a, array_b): + return torch.cat([array_a, array_b]) + + with pytest.raises(TypeError): + _ = test_func_2(np_array_a, np_array_b) + + # wrong template_arg_name_ + @array_converter( + to_torch=True, apply_to=('array_a', ), template_arg_name_='array_c') + def test_func_3(array_a, array_b): + return torch.cat([array_a, array_b]) + + with pytest.raises(ValueError): + _ = test_func_3(np_array_a, np_array_b) + + # wrong apply_to + @array_converter(to_torch=True, apply_to=('array_a', 'array_c')) + def test_func_4(array_a, array_b): + return torch.cat([array_a, array_b]) + + with pytest.raises(ValueError): + _ = test_func_4(np_array_a, np_array_b) + + # to numpy + @array_converter(to_torch=False, apply_to=('array_a', 'array_b')) + def test_func_5(array_a, array_b, container): + container.append(array_a) + container.append(array_b) + return array_a.copy(), array_b.copy() + + pt_array_a = torch.tensor([0.0]) + pt_array_b = torch.tensor([0.0]) + container = [] + new_array_a, new_array_b = test_func_5(pt_array_a, pt_array_b, container) + + assert isinstance(container[0], np.ndarray) + assert isinstance(container[1], np.ndarray) + assert isinstance(new_array_a, torch.Tensor) + assert isinstance(new_array_b, torch.Tensor) + + # apply_to = None + @array_converter(to_torch=False) + def test_func_6(array_a, array_b, container): + container.append(array_a) + container.append(array_b) + return array_a.clone(), array_b.clone() + + container = [] + new_array_a, new_array_b = test_func_6(pt_array_a, pt_array_b, container) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert isinstance(new_array_a, torch.Tensor) + assert isinstance(new_array_b, torch.Tensor) + + # with default arg + @array_converter(to_torch=True, apply_to=('array_a', 'array_b')) + def test_func_7(array_a, container, array_b=np.array([2.])): + container.append(array_a) + container.append(array_b) + return array_a.clone(), array_b.clone() + + container = [] + new_array_a, new_array_b = test_func_7(np_array_a, container) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert isinstance(new_array_a, np.ndarray) + assert isinstance(new_array_b, np.ndarray) + assert np.allclose(new_array_b, np.array([2.]), 1e-3) + + # 
override default arg + + container = [] + new_array_a, new_array_b = test_func_7(np_array_a, container, + np.array([4.])) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert isinstance(new_array_a, np.ndarray) + assert np.allclose(new_array_b, np.array([4.]), 1e-3) + + # list arg + @array_converter(to_torch=True, apply_to=('array_a', 'array_b')) + def test_func_8(container, array_a, array_b=[2.]): + container.append(array_a) + container.append(array_b) + return array_a.clone(), array_b.clone() + + container = [] + new_array_a, new_array_b = test_func_8(container, [3.]) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert np.allclose(new_array_a, np.array([3.]), 1e-3) + assert np.allclose(new_array_b, np.array([2.]), 1e-3) + + # number arg + @array_converter(to_torch=True, apply_to=('array_a', 'array_b')) + def test_func_9(container, array_a, array_b=1): + container.append(array_a) + container.append(array_b) + return array_a.clone(), array_b.clone() + + container = [] + new_array_a, new_array_b = test_func_9(container, np_array_a) + + assert isinstance(container[0], torch.FloatTensor) + assert isinstance(container[1], torch.FloatTensor) + assert np.allclose(new_array_a, np_array_a, 1e-3) + assert np.allclose(new_array_b, np.array(1.0), 1e-3) + + # feed kwargs + container = [] + kwargs = {'array_a': [5.], 'array_b': [6.]} + new_array_a, new_array_b = test_func_8(container, **kwargs) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert np.allclose(new_array_a, np.array([5.]), 1e-3) + assert np.allclose(new_array_b, np.array([6.]), 1e-3) + + # feed args and kwargs + container = [] + kwargs = {'array_b': [7.]} + args = (container, [8.]) + new_array_a, new_array_b = test_func_8(*args, **kwargs) + + assert isinstance(container[0], torch.Tensor) + assert isinstance(container[1], torch.Tensor) + assert np.allclose(new_array_a, np.array([8.]), 1e-3) + assert np.allclose(new_array_b, np.array([7.]), 1e-3) + + # wrong template arg type + with pytest.raises(TypeError): + new_array_a, new_array_b = test_func_9(container, 3 + 4j) + + with pytest.raises(TypeError): + new_array_a, new_array_b = test_func_9(container, {}) + + # invalid template arg list + with pytest.raises(TypeError): + new_array_a, new_array_b = test_func_9(container, + [True, np.array([3.0])]) + + +def test_points_img2cam(): + points = torch.tensor([[0.5764, 0.9109, 0.7576], [0.6656, 0.5498, 0.9813]]) + cam2img = torch.tensor([[700., 0., 450., 0.], [0., 700., 200., 0.], + [0., 0., 1., 0.]]) + xyzs = points_img2cam(points, cam2img) + expected_xyzs = torch.tensor([[-0.4864, -0.2155, 0.7576], + [-0.6299, -0.2796, 0.9813]]) + assert torch.allclose(xyzs, expected_xyzs, atol=1e-3) + + +def test_generate_edge_indices(): + + input_metas = [ + dict(img_shape=(110, 110), pad_shape=(128, 128)), + dict(img_shape=(98, 110), pad_shape=(128, 128)) + ] + downsample_ratio = 4 + edge_indices_list = get_edge_indices(input_metas, downsample_ratio) + + assert edge_indices_list[0].shape[0] == 108 + assert edge_indices_list[1].shape[0] == 102 + + +def test_truncation_handle(): + + centers2d_list = [ + torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]]) + ] + + gt_bboxes_list = [ + torch.tensor([[0.25, 99.8, 99.8, 199.6], [300.2, 250.1, 399.8, 299.6], + [100.2, 20.1, 300.8, 180.7]]) + ] + img_metas = [dict(img_shape=[300, 400])] + centers2d_target_list, offsets2d_list, trunc_mask_list
= \ + handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas) + + centers2d_target = torch.tensor([[0., 166.30435501], [379.03437877, 299.], + [201.2, 99.86]]) + + offsets2d = torch.tensor([[-99.86, 33.45], [120.5, 100.2], [0.2, -0.14]]) + trunc_mask = torch.tensor([True, True, False]) + + assert torch.allclose(centers2d_target_list[0], centers2d_target) + assert torch.allclose(offsets2d_list[0], offsets2d, atol=1e-4) + assert torch.all(trunc_mask_list[0] == trunc_mask) + assert torch.allclose( + centers2d_target_list[0].round().int() + offsets2d_list[0], + centers2d_list[0]) + + +def test_filter_outside_objs(): + + centers2d_list = [ + torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]]), + torch.tensor([[-47.86, 199.45], [410.50, 399.20], [401.20, 349.86]]) + ] + gt_bboxes_list = [ + torch.rand([3, 4], dtype=torch.float32), + torch.rand([3, 4], dtype=torch.float32) + ] + gt_bboxes_3d_list = [ + CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7), + CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7) + ] + gt_labels_list = [torch.tensor([0, 1, 2]), torch.tensor([2, 0, 0])] + gt_labels_3d_list = [torch.tensor([0, 1, 2]), torch.tensor([2, 0, 0])] + img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])] + filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, + gt_labels_3d_list, centers2d_list, img_metas) + + assert len(centers2d_list[0]) == len(gt_bboxes_3d_list[0]) == \ + len(gt_bboxes_list[0]) == len(gt_labels_3d_list[0]) == \ + len(gt_labels_list[0]) == 1 + + assert len(centers2d_list[1]) == len(gt_bboxes_3d_list[1]) == \ + len(gt_bboxes_list[1]) == len(gt_labels_3d_list[1]) == \ + len(gt_labels_list[1]) == 2 + + +def test_generate_keypoints(): + + centers2d_list = [ + torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]]), + torch.tensor([[-47.86, 199.45], [410.50, 399.20], [401.20, 349.86]]) + ] + gt_bboxes_3d_list = [ + CameraInstance3DBoxes(torch.rand([3, 7])), + CameraInstance3DBoxes(torch.rand([3, 7])) + ] + img_metas = [ + dict( + cam2img=[[1260.8474446004698, 0.0, 807.968244525554, 40.1111], + [0.0, 1260.8474446004698, 495.3344268742088, 2.34422], + [0.0, 0.0, 1.0, 0.00333333], [0.0, 0.0, 0.0, 1.0]], + img_shape=(300, 400)) for i in range(2) + ] + + keypoints2d_list, keypoints_depth_mask_list = \ + get_keypoints(gt_bboxes_3d_list, centers2d_list, img_metas) + + assert keypoints2d_list[0].shape == (3, 10, 3) + assert keypoints_depth_mask_list[0].shape == (3, 3) diff --git a/tools/analysis_tools/analyze_logs.py b/tools/analysis_tools/analyze_logs.py index 806175f34c..18858466e6 100644 --- a/tools/analysis_tools/analyze_logs.py +++ b/tools/analysis_tools/analyze_logs.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import json +from collections import defaultdict + import numpy as np import seaborn as sns -from collections import defaultdict from matplotlib import pyplot as plt diff --git a/tools/analysis_tools/benchmark.py b/tools/analysis_tools/benchmark.py index 17c9dd35f1..b31c9f0958 100644 --- a/tools/analysis_tools/benchmark.py +++ b/tools/analysis_tools/benchmark.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import time + import torch from mmcv import Config from mmcv.parallel import MMDataParallel diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py index bb66c647ec..f45ed80f88 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. 
All rights reserved. import argparse + import torch from mmcv import Config, DictAction diff --git a/tools/create_data.py b/tools/create_data.py index 2ec0440105..6633c2840b 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -61,7 +61,8 @@ def nuscenes_data_prep(root_path, version (str): Dataset version. dataset_name (str): The dataset class name. out_dir (str): Output directory of the groundtruth database info. - max_sweeps (int): Number of input consecutive frames. Default: 10 + max_sweeps (int, optional): Number of input consecutive frames. + Default: 10 """ nuscenes_converter.create_nuscenes_infos( root_path, info_prefix, version=version, max_sweeps=max_sweeps) @@ -152,8 +153,9 @@ def waymo_data_prep(root_path, info_prefix (str): The prefix of info filenames. out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. - max_sweeps (int): Number of input consecutive frames. Default: 5 \ - Here we store pose information of these frames for later use. + max_sweeps (int, optional): Number of input consecutive frames. + Default: 5. Here we store pose information of these frames + for later use. """ from tools.data_converter import waymo_converter as waymo @@ -206,7 +208,7 @@ def waymo_data_prep(root_path, '--out-dir', type=str, default='./data/kitti', - required='False', + required=False, help='name of info pkl') parser.add_argument('--extra-tag', type=str, default='kitti') parser.add_argument( diff --git a/tools/create_data.sh b/tools/create_data.sh index 4007de4095..9a57852f71 100755 --- a/tools/create_data.sh +++ b/tools/create_data.sh @@ -5,8 +5,7 @@ export PYTHONPATH=`pwd`:$PYTHONPATH PARTITION=$1 JOB_NAME=$2 -CONFIG=$3 -WORK_DIR=$4 +DATASET=$3 GPUS=${GPUS:-1} GPUS_PER_NODE=${GPUS_PER_NODE:-1} SRUN_ARGS=${SRUN_ARGS:-""} @@ -19,7 +18,7 @@ srun -p ${PARTITION} \ --ntasks-per-node=${GPUS_PER_NODE} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ - python -u tools/create_data.py kitti \ - --root-path ./data/kitti \ - --out-dir ./data/kitti \ - --extra-tag kitti + python -u tools/create_data.py ${DATASET} \ + --root-path ./data/${DATASET} \ + --out-dir ./data/${DATASET} \ + --extra-tag ${DATASET} diff --git a/tools/data_converter/create_gt_database.py b/tools/data_converter/create_gt_database.py index 7317cedd08..8737dcda40 100644 --- a/tools/data_converter/create_gt_database.py +++ b/tools/data_converter/create_gt_database.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import pickle +from os import path as osp + import mmcv import numpy as np -import pickle from mmcv import track_iter_progress from mmcv.ops import roi_align -from os import path as osp from pycocotools import mask as maskUtils from pycocotools.coco import COCO @@ -126,19 +127,19 @@ def create_groundtruth_database(dataset_class_name, dataset_class_name (str): Name of the input dataset. data_path (str): Path of the data. info_prefix (str): Prefix of the info file. - info_path (str): Path of the info file. + info_path (str, optional): Path of the info file. Default: None. - mask_anno_path (str): Path of the mask_anno. + mask_anno_path (str, optional): Path of the mask_anno. Default: None. - used_classes (list[str]): Classes have been used. + used_classes (list[str], optional): Classes have been used. Default: None. - database_save_path (str): Path to save database. + database_save_path (str, optional): Path to save database. Default: None. - db_info_save_path (str): Path to save db_info. + db_info_save_path (str, optional): Path to save db_info. 
Default: None. - relative_path (bool): Whether to use relative path. + relative_path (bool, optional): Whether to use relative path. Default: True. - with_mask (bool): Whether to use mask. + with_mask (bool, optional): Whether to use mask. Default: False. """ print(f'Create GT Database of {dataset_class_name}') diff --git a/tools/data_converter/indoor_converter.py b/tools/data_converter/indoor_converter.py index 4072397605..d3be367640 100644 --- a/tools/data_converter/indoor_converter.py +++ b/tools/data_converter/indoor_converter.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os + import mmcv import numpy as np -import os from tools.data_converter.s3dis_data_utils import S3DISData, S3DISSegData from tools.data_converter.scannet_data_utils import ScanNetData, ScanNetSegData @@ -19,10 +20,11 @@ def create_indoor_info_file(data_path, Args: data_path (str): Path of the data. - pkl_prefix (str): Prefix of the pkl to be saved. Default: 'sunrgbd'. - save_path (str): Path of the pkl to be saved. Default: None. - use_v1 (bool): Whether to use v1. Default: False. - workers (int): Number of threads to be used. Default: 4. + pkl_prefix (str, optional): Prefix of the pkl to be saved. + Default: 'sunrgbd'. + save_path (str, optional): Path of the pkl to be saved. Default: None. + use_v1 (bool, optional): Whether to use v1. Default: False. + workers (int, optional): Number of threads to be used. Default: 4. """ assert os.path.exists(data_path) assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \ diff --git a/tools/data_converter/kitti_converter.py b/tools/data_converter/kitti_converter.py index eb15aeb46a..40e770f63e 100644 --- a/tools/data_converter/kitti_converter.py +++ b/tools/data_converter/kitti_converter.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from pathlib import Path + import mmcv import numpy as np -from collections import OrderedDict from nuscenes.utils.geometry_utils import view_points -from pathlib import Path -from mmdet3d.core.bbox import box_np_ops +from mmdet3d.core.bbox import box_np_ops, points_cam2img from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info from .nuscenes_converter import post_process_coords @@ -94,9 +95,12 @@ def create_kitti_info_file(data_path, Args: data_path (str): Path of the data root. - pkl_prefix (str): Prefix of the info file to be generated. - save_path (str): Path to save the info file. - relative_path (bool): Whether to use relative path. + pkl_prefix (str, optional): Prefix of the info file to be generated. + Default: 'kitti'. + save_path (str, optional): Path to save the info file. + Default: None. + relative_path (bool, optional): Whether to use relative path. + Default: True. 
""" imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) @@ -113,6 +117,7 @@ def create_kitti_info_file(data_path, training=True, velodyne=True, calib=True, + with_plane=True, image_ids=train_img_ids, relative_path=relative_path) _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path) @@ -124,6 +129,7 @@ def create_kitti_info_file(data_path, training=True, velodyne=True, calib=True, + with_plane=True, image_ids=val_img_ids, relative_path=relative_path) _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path) @@ -140,6 +146,7 @@ def create_kitti_info_file(data_path, label_info=False, velodyne=True, calib=True, + with_plane=False, image_ids=test_img_ids, relative_path=relative_path) filename = save_path / f'{pkl_prefix}_infos_test.pkl' @@ -158,10 +165,14 @@ def create_waymo_info_file(data_path, Args: data_path (str): Path of the data root. - pkl_prefix (str): Prefix of the info file to be generated. - save_path (str | None): Path to save the info file. - relative_path (bool): Whether to use relative path. - max_sweeps (int): Max sweeps before the detection frame to be used. + pkl_prefix (str, optional): Prefix of the info file to be generated. + Default: 'waymo'. + save_path (str, optional): Path to save the info file. + Default: None. + relative_path (bool, optional): Whether to use relative path. + Default: True. + max_sweeps (int, optional): Max sweeps before the detection frame + to be used. Default: 5. """ imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) @@ -238,11 +249,13 @@ def _create_reduced_point_cloud(data_path, Args: data_path (str): Path of original data. info_path (str): Path of data info. - save_path (str | None): Path to save reduced point cloud data. - Default: None. - back (bool): Whether to flip the points to back. - num_features (int): Number of point features. Default: 4. - front_camera_id (int): The referenced/front camera ID. Default: 2. + save_path (str, optional): Path to save reduced point cloud + data. Default: None. + back (bool, optional): Whether to flip the points to back. + Default: False. + num_features (int, optional): Number of point features. Default: 4. + front_camera_id (int, optional): The referenced/front camera ID. + Default: 2. """ kitti_infos = mmcv.load(info_path) @@ -298,14 +311,16 @@ def create_reduced_point_cloud(data_path, Args: data_path (str): Path of original data. pkl_prefix (str): Prefix of info files. - train_info_path (str | None): Path of training set info. + train_info_path (str, optional): Path of training set info. + Default: None. + val_info_path (str, optional): Path of validation set info. Default: None. - val_info_path (str | None): Path of validation set info. + test_info_path (str, optional): Path of test set info. Default: None. - test_info_path (str | None): Path of test set info. + save_path (str, optional): Path to save reduced point cloud data. Default: None. - save_path (str | None): Path to save reduced point cloud data. - with_back (bool): Whether to flip the points to back. + with_back (bool, optional): Whether to flip the points to back. + Default: False. """ if train_info_path is None: train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl' @@ -335,7 +350,8 @@ def export_2d_annotation(root_path, info_path, mono3d=True): Args: root_path (str): Root path of the raw data. info_path (str): Path of the info file. 
- mono3d (bool): Whether to export mono3d annotation. Default: True. + mono3d (bool, optional): Whether to export mono3d annotation. + Default: True. """ # get bbox annotations for camera kitti_infos = mmcv.load(info_path) @@ -381,8 +397,8 @@ def get_2d_boxes(info, occluded, mono3d=True): Args: info: Information of the given sample data. - occluded: Integer (0, 1, 2, 3) indicating occlusion state: \ - 0 = fully visible, 1 = partly occluded, 2 = largely occluded, \ + occluded: Integer (0, 1, 2, 3) indicating occlusion state: + 0 = fully visible, 1 = partly occluded, 2 = largely occluded, 3 = unknown, -1 = DontCare mono3d (bool): Whether to get boxes with mono3d annotation. @@ -471,7 +487,7 @@ def get_2d_boxes(info, occluded, mono3d=True): repro_rec['velo_cam3d'] = -1 # no velocity in KITTI center3d = np.array(loc).reshape([1, 3]) - center2d = box_np_ops.points_cam2img( + center2d = points_cam2img( center3d, camera_intrinsic, with_depth=True) repro_rec['center2d'] = center2d.squeeze().tolist() # normalized center2D + depth @@ -488,7 +504,7 @@ def get_2d_boxes(info, occluded, mono3d=True): def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename): - """Generate one 2D annotation record given various informations on top of + """Generate one 2D annotation record given various information on top of the 2D bounding box coordinates. Args: @@ -503,12 +519,12 @@ def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename): Returns: dict: A sample 2D annotation record. - - file_name (str): flie name + - file_name (str): file name - image_id (str): sample data token - area (float): 2d box area - category_name (str): category name - category_id (int): category id - - bbox (list[float]): left x, top y, dx, dy of 2d box + - bbox (list[float]): left x, top y, x_size, y_size of 2d box - iscrowd (int): whether the area is crowd """ repro_rec = OrderedDict() diff --git a/tools/data_converter/kitti_data_utils.py b/tools/data_converter/kitti_data_utils.py index 01538e065b..8e3dba6f35 100644 --- a/tools/data_converter/kitti_data_utils.py +++ b/tools/data_converter/kitti_data_utils.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import numpy as np from collections import OrderedDict from concurrent import futures as futures from os import path as osp from pathlib import Path + +import mmcv +import numpy as np from skimage import io @@ -59,6 +61,17 @@ def get_label_path(idx, relative_path, exist_check, use_prefix_id) +def get_plane_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + info_type='planes', + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, info_type, '.txt', training, + relative_path, exist_check, use_prefix_id) + + def get_velodyne_path(idx, prefix, training=True, @@ -143,6 +156,7 @@ def get_kitti_image_info(path, label_info=True, velodyne=False, calib=False, + with_plane=False, image_ids=7481, extend_matrix=True, num_worker=8, @@ -251,6 +265,13 @@ def map_func(idx): calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo info['calib'] = calib_info + if with_plane: + plane_path = get_plane_path(idx, path, training, relative_path) + if relative_path: + plane_path = str(root_path / plane_path) + lines = mmcv.list_from_file(plane_path) + info['plane'] = np.array([float(i) for i in lines[3].split()]) + if annotations is not None: info['annos'] = annotations add_difficulty_to_annos(info) diff --git a/tools/data_converter/lyft_converter.py b/tools/data_converter/lyft_converter.py index 1fc1555a26..c6a89d0d2b 100644 --- a/tools/data_converter/lyft_converter.py +++ b/tools/data_converter/lyft_converter.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import os from logging import warning -from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft from os import path as osp + +import mmcv +import numpy as np +from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft from pyquaternion import Quaternion from mmdet3d.datasets import LyftDataset @@ -26,10 +27,10 @@ def create_lyft_infos(root_path, Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. - version (str): Version of the data. - Default: 'v1.01-train' - max_sweeps (int): Max number of sweeps. - Default: 10 + version (str, optional): Version of the data. + Default: 'v1.01-train'. + max_sweeps (int, optional): Max number of sweeps. + Default: 10. """ lyft = Lyft( data_path=osp.join(root_path, version), @@ -101,9 +102,9 @@ def _fill_trainval_infos(lyft, lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset. train_scenes (list[str]): Basic information of training scenes. val_scenes (list[str]): Basic information of validation scenes. - test (bool): Whether use the test mode. In the test mode, no + test (bool, optional): Whether use the test mode. In the test mode, no annotations can be accessed. Default: False. - max_sweeps (int): Max number of sweeps. Default: 10. + max_sweeps (int, optional): Max number of sweeps. Default: 10. Returns: tuple[list[dict]]: Information of training set and @@ -192,8 +193,10 @@ def _fill_trainval_infos(lyft, names[i] = LyftDataset.NameMapping[names[i]] names = np.array(names) - # we need to convert rot to SECOND format. 
- gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + # we need to convert box size to + # the format of our lidar coordinate system + # which is x_size, y_size, z_size (corresponding to l, w, h) + gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1) assert len(gt_boxes) == len( annotations), f'{len(gt_boxes)}, {len(annotations)}' info['gt_boxes'] = gt_boxes diff --git a/tools/data_converter/lyft_data_fixer.py b/tools/data_converter/lyft_data_fixer.py index 42070490ca..55103515a2 100644 --- a/tools/data_converter/lyft_data_fixer.py +++ b/tools/data_converter/lyft_data_fixer.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import numpy as np import os +import numpy as np + def fix_lyft(root_folder='./data/lyft', version='v1.01'): # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa diff --git a/tools/data_converter/nuimage_converter.py b/tools/data_converter/nuimage_converter.py index 92be1de3db..a46015a1a0 100644 --- a/tools/data_converter/nuimage_converter.py +++ b/tools/data_converter/nuimage_converter.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import base64 +from os import path as osp + import mmcv import numpy as np from nuimages import NuImages from nuimages.utils.utils import mask_decode, name_to_index_mapping -from os import path as osp nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', diff --git a/tools/data_converter/nuscenes_converter.py b/tools/data_converter/nuscenes_converter.py index a6ebdbba9c..c6140fcc38 100644 --- a/tools/data_converter/nuscenes_converter.py +++ b/tools/data_converter/nuscenes_converter.py @@ -1,16 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import os from collections import OrderedDict +from os import path as osp +from typing import List, Tuple, Union + +import mmcv +import numpy as np from nuscenes.nuscenes import NuScenes from nuscenes.utils.geometry_utils import view_points -from os import path as osp from pyquaternion import Quaternion from shapely.geometry import MultiPoint, box -from typing import List, Tuple, Union -from mmdet3d.core.bbox.box_np_ops import points_cam2img +from mmdet3d.core.bbox import points_cam2img from mmdet3d.datasets import NuScenesDataset nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', @@ -34,10 +35,10 @@ def create_nuscenes_infos(root_path, Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. - version (str): Version of the data. - Default: 'v1.0-trainval' - max_sweeps (int): Max number of sweeps. - Default: 10 + version (str, optional): Version of the data. + Default: 'v1.0-trainval'. + max_sweeps (int, optional): Max number of sweeps. + Default: 10. """ from nuscenes.nuscenes import NuScenes nusc = NuScenes(version=version, dataroot=root_path, verbose=True) @@ -152,9 +153,9 @@ def _fill_trainval_infos(nusc, nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. train_scenes (list[str]): Basic information of training scenes. val_scenes (list[str]): Basic information of validation scenes. - test (bool): Whether use the test mode. In the test mode, no + test (bool, optional): Whether use the test mode. In test mode, no annotations can be accessed. Default: False. - max_sweeps (int): Max number of sweeps. Default: 10. 
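Both the Lyft converter above and the nuScenes converter below now store `gt_boxes` as `(x, y, z, x_size, y_size, z_size, yaw)` instead of the old SECOND-style `-rot - pi/2` encoding. A small numeric sketch of the column reorder (values are made up):

```python
import numpy as np

locs = np.array([[10.0, 2.0, -1.0]])      # box centers (x, y, z)
dims_wlh = np.array([[1.9, 4.5, 1.6]])    # SDK order: width, length, height
rots = np.array([[0.3]])                  # yaw around the up axis

# dims[:, [1, 0, 2]] turns (w, l, h) into (x_size, y_size, z_size) = (l, w, h)
gt_boxes = np.concatenate([locs, dims_wlh[:, [1, 0, 2]], rots], axis=1)
print(gt_boxes)   # [[10.   2.  -1.   4.5  1.9  1.6  0.3]]
```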
+ max_sweeps (int, optional): Max number of sweeps. Default: 10. Returns: tuple[list[dict]]: Information of training set and validation set @@ -249,8 +250,10 @@ def _fill_trainval_infos(nusc, if names[i] in NuScenesDataset.NameMapping: names[i] = NuScenesDataset.NameMapping[names[i]] names = np.array(names) - # we need to convert rot to SECOND format. - gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + # we need to convert box size to + # the format of our lidar coordinate system + # which is x_size, y_size, z_size (corresponding to l, w, h) + gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1) assert len(gt_boxes) == len( annotations), f'{len(gt_boxes)}, {len(annotations)}' info['gt_boxes'] = gt_boxes @@ -289,7 +292,7 @@ def obtain_sensor2top(nusc, e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). e2g_r_mat (np.ndarray): Rotation matrix from ego to global in shape (3, 3). - sensor_type (str): Sensor to calibrate. Default: 'lidar'. + sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'. Returns: sweep (dict): Sweep information after transformation. @@ -338,7 +341,8 @@ def export_2d_annotation(root_path, info_path, version, mono3d=True): root_path (str): Root path of the raw data. info_path (str): Path of the info file. version (str): Dataset version. - mono3d (bool): Whether to export mono3d annotation. Default: True. + mono3d (bool, optional): Whether to export mono3d annotation. + Default: True. """ # get bbox annotations for camera camera_types = [ @@ -402,7 +406,7 @@ def get_2d_boxes(nusc, """Get the 2D annotation records for a given `sample_data_token`. Args: - sample_data_token (str): Sample data token belonging to a camera \ + sample_data_token (str): Sample data token belonging to a camera keyframe. visibilities (list[str]): Visibility filter. mono3d (bool): Whether to get boxes with mono3d annotation. @@ -562,7 +566,7 @@ def post_process_coords( def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, sample_data_token: str, filename: str) -> OrderedDict: - """Generate one 2D annotation record given various informations on top of + """Generate one 2D annotation record given various information on top of the 2D bounding box coordinates. Args: @@ -577,7 +581,7 @@ def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, Returns: dict: A sample 2D annotation record. - - file_name (str): flie name + - file_name (str): file name - image_id (str): sample data token - area (float): 2d box area - category_name (str): category name diff --git a/tools/data_converter/s3dis_data_utils.py b/tools/data_converter/s3dis_data_utils.py index d2b6b773e9..0f5bea8399 100644 --- a/tools/data_converter/s3dis_data_utils.py +++ b/tools/data_converter/s3dis_data_utils.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import os from concurrent import futures as futures from os import path as osp +import mmcv +import numpy as np + class S3DISData(object): """S3DIS data. @@ -13,7 +14,7 @@ class S3DISData(object): Args: root_path (str): Root path of the raw data. - split (str): Set split type of the data. Default: 'Area_1'. + split (str, optional): Set split type of the data. Default: 'Area_1'. """ def __init__(self, root_path, split='Area_1'): @@ -48,9 +49,11 @@ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): This method gets information from the raw data. Args: - num_workers (int): Number of threads to be used. Default: 4. 
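`generate_record` in both converters documents the same COCO-style output dict. A sketch covering only the documented fields (the real function additionally copies attributes from `ann_rec` and maps category names, which is omitted here):

```python
from collections import OrderedDict


def make_2d_record(x1, y1, x2, y2, sample_data_token, filename,
                   category_name, category_id):
    # bbox is stored as [left x, top y, x_size, y_size]; area is the box area.
    rec = OrderedDict()
    rec['file_name'] = filename
    rec['image_id'] = sample_data_token
    rec['area'] = (x2 - x1) * (y2 - y1)
    rec['category_name'] = category_name
    rec['category_id'] = category_id
    rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
    rec['iscrowd'] = 0
    return rec


print(make_2d_record(10.0, 20.0, 110.0, 80.0, 'token', 'image.png', 'car', 0))
```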
- has_label (bool): Whether the data has label. Default: True. - sample_id_list (list[int]): Index list of the sample. + num_workers (int, optional): Number of threads to be used. + Default: 4. + has_label (bool, optional): Whether the data has label. + Default: True. + sample_id_list (list[int], optional): Index list of the sample. Default: None. Returns: @@ -154,10 +157,11 @@ class S3DISSegData(object): Args: data_root (str): Root path of the raw data. ann_file (str): The generated scannet infos. - split (str): Set split type of the data. Default: 'train'. - num_points (int): Number of points in each data input. Default: 8192. - label_weight_func (function): Function to compute the label weight. - Default: None. + split (str, optional): Set split type of the data. Default: 'train'. + num_points (int, optional): Number of points in each data input. + Default: 8192. + label_weight_func (function, optional): Function to compute the + label weight. Default: None. """ def __init__(self, @@ -209,7 +213,7 @@ def _convert_to_label(self, mask): return label def get_scene_idxs_and_label_weight(self): - """Compute scene_idxs for data sampling and label weight for loss \ + """Compute scene_idxs for data sampling and label weight for loss calculation. We sample more times for scenes with more points. Label_weight is diff --git a/tools/data_converter/scannet_data_utils.py b/tools/data_converter/scannet_data_utils.py index a437fe01ce..7fb2efdd5e 100644 --- a/tools/data_converter/scannet_data_utils.py +++ b/tools/data_converter/scannet_data_utils.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np import os from concurrent import futures as futures from os import path as osp +import mmcv +import numpy as np + class ScanNetData(object): """ScanNet data. @@ -13,7 +14,7 @@ class ScanNetData(object): Args: root_path (str): Root path of the raw data. - split (str): Set split type of the data. Default: 'train'. + split (str, optional): Set split type of the data. Default: 'train'. """ def __init__(self, root_path, split='train'): @@ -90,9 +91,11 @@ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): This method gets information from the raw data. Args: - num_workers (int): Number of threads to be used. Default: 4. - has_label (bool): Whether the data has label. Default: True. - sample_id_list (list[int]): Index list of the sample. + num_workers (int, optional): Number of threads to be used. + Default: 4. + has_label (bool, optional): Whether the data has label. + Default: True. + sample_id_list (list[int], optional): Index list of the sample. Default: None. Returns: @@ -201,10 +204,11 @@ class ScanNetSegData(object): Args: data_root (str): Root path of the raw data. ann_file (str): The generated scannet infos. - split (str): Set split type of the data. Default: 'train'. - num_points (int): Number of points in each data input. Default: 8192. - label_weight_func (function): Function to compute the label weight. - Default: None. + split (str, optional): Set split type of the data. Default: 'train'. + num_points (int, optional): Number of points in each data input. + Default: 8192. + label_weight_func (function, optional): Function to compute the + label weight. Default: None. 
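The `get_infos` methods whose docstrings are touched above all share one pattern: a per-scene worker dispatched through `concurrent.futures` with `num_workers` threads. A stripped-down sketch of that pattern, with a placeholder worker body:

```python
from concurrent import futures


def get_infos(sample_id_list, num_workers=4):
    def process_single_scene(sample_idx):
        # a real worker would load points, labels and boxes for this scene
        return {'sample_idx': sample_idx}

    with futures.ThreadPoolExecutor(num_workers) as executor:
        infos = executor.map(process_single_scene, sample_id_list)
    return list(infos)


print(get_infos([0, 1, 2], num_workers=2))
```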
""" def __init__(self, @@ -261,7 +265,7 @@ def _convert_to_label(self, mask): return label def get_scene_idxs_and_label_weight(self): - """Compute scene_idxs for data sampling and label weight for loss \ + """Compute scene_idxs for data sampling and label weight for loss calculation. We sample more times for scenes with more points. Label_weight is diff --git a/tools/data_converter/sunrgbd_data_utils.py b/tools/data_converter/sunrgbd_data_utils.py index 9f8a502e90..152ea42f4f 100644 --- a/tools/data_converter/sunrgbd_data_utils.py +++ b/tools/data_converter/sunrgbd_data_utils.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np from concurrent import futures as futures from os import path as osp + +import mmcv +import numpy as np from scipy import io as sio @@ -42,18 +43,20 @@ def __init__(self, line): self.ymax = data[2] + data[4] self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax]) self.centroid = np.array([data[5], data[6], data[7]]) - self.w = data[8] - self.l = data[9] # noqa: E741 - self.h = data[10] + self.width = data[8] + self.length = data[9] + self.height = data[10] + # data[9] is x_size (length), data[8] is y_size (width), data[10] is + # z_size (height) in our depth coordinate system, + # l corresponds to the size along the x axis + self.size = np.array([data[9], data[8], data[10]]) * 2 self.orientation = np.zeros((3, )) self.orientation[0] = data[11] self.orientation[1] = data[12] - self.heading_angle = -1 * np.arctan2(self.orientation[1], - self.orientation[0]) - self.box3d = np.concatenate([ - self.centroid, - np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle]) - ]) + self.heading_angle = np.arctan2(self.orientation[1], + self.orientation[0]) + self.box3d = np.concatenate( + [self.centroid, self.size, self.heading_angle[None]]) class SUNRGBDData(object): @@ -63,8 +66,8 @@ class SUNRGBDData(object): Args: root_path (str): Root path of the raw data. - split (str): Set split type of the data. Default: 'train'. - use_v1 (bool): Whether to use v1. Default: False. + split (str, optional): Set split type of the data. Default: 'train'. + use_v1 (bool, optional): Whether to use v1. Default: False. """ def __init__(self, root_path, split='train', use_v1=False): @@ -129,9 +132,11 @@ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): This method gets information from the raw data. Args: - num_workers (int): Number of threads to be used. Default: 4. - has_label (bool): Whether the data has label. Default: True. - sample_id_list (list[int]): Index list of the sample. + num_workers (int, optional): Number of threads to be used. + Default: 4. + has_label (bool, optional): Whether the data has label. + Default: True. + sample_id_list (list[int], optional): Index list of the sample. Default: None. 
Returns: @@ -192,7 +197,7 @@ def process_single_scene(sample_idx): ], axis=0) annotations['dimensions'] = 2 * np.array([ - [obj.l, obj.w, obj.h] for obj in obj_list + [obj.length, obj.width, obj.height] for obj in obj_list if obj.classname in self.cat2label.keys() ]) # lwh (depth) format annotations['rotation_y'] = np.array([ diff --git a/tools/data_converter/waymo_converter.py b/tools/data_converter/waymo_converter.py index fc2ae013b5..a4ff5fd7bb 100644 --- a/tools/data_converter/waymo_converter.py +++ b/tools/data_converter/waymo_converter.py @@ -10,11 +10,12 @@ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') +from glob import glob +from os.path import join + import mmcv import numpy as np import tensorflow as tf -from glob import glob -from os.path import join from waymo_open_dataset.utils import range_image_utils, transform_utils from waymo_open_dataset.utils.frame_utils import \ parse_range_image_and_camera_projection @@ -31,8 +32,8 @@ class Waymo2KITTI(object): save_dir (str): Directory to save data in KITTI format. prefix (str): Prefix of filename. In general, 0 for training, 1 for validation and 2 for testing. - workers (str): Number of workers for the parallel process. - test_mode (bool): Whether in the test_mode. Default: False. + workers (int, optional): Number of workers for the parallel process. + test_mode (bool, optional): Whether in the test_mode. Default: False. """ def __init__(self, @@ -402,8 +403,8 @@ def convert_range_image_to_point_cloud(self, camera projections corresponding with two returns. range_image_top_pose (:obj:`Transform`): Range image pixel pose for top lidar. - ri_index (int): 0 for the first return, 1 for the second return. - Default: 0. + ri_index (int, optional): 0 for the first return, + 1 for the second return. Default: 0. Returns: tuple[list[np.ndarray]]: (List of points with shape [N, 3], diff --git a/tools/deployment/mmdet3d2torchserve.py b/tools/deployment/mmdet3d2torchserve.py new file mode 100644 index 0000000000..df7e6084a3 --- /dev/null +++ b/tools/deployment/mmdet3d2torchserve.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser, Namespace +from pathlib import Path +from tempfile import TemporaryDirectory + +import mmcv + +try: + from model_archiver.model_packaging import package_model + from model_archiver.model_packaging_utils import ModelExportUtils +except ImportError: + package_model = None + + +def mmdet3d2torchserve( + config_file: str, + checkpoint_file: str, + output_folder: str, + model_name: str, + model_version: str = '1.0', + force: bool = False, +): + """Converts MMDetection3D model (config + checkpoint) to TorchServe `.mar`. + + Args: + config_file (str): + In MMDetection3D config format. + The contents vary for each task repository. + checkpoint_file (str): + In MMDetection3D checkpoint format. + The contents vary for each task repository. + output_folder (str): + Folder where `{model_name}.mar` will be created. + The file created will be in TorchServe archive format. + model_name (str): + If not None, used for naming the `{model_name}.mar` file + that will be created under `output_folder`. + If None, `{Path(checkpoint_file).stem}` will be used. + model_version (str, optional): + Model's version. Default: '1.0'. + force (bool, optional): + If True, if there is an existing `{model_name}.mar` + file under `output_folder` it will be overwritten. + Default: False. 
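For the new `tools/deployment/mmdet3d2torchserve.py`, whose body follows, a possible invocation, assuming `torch-model-archiver` is installed, the script is run from the repo root, and a trained checkpoint exists at the (hypothetical) path below:

```python
# Hypothetical paths; assumes tools/deployment is on sys.path so the new
# converter module can be imported directly.
import sys

sys.path.append('tools/deployment')
from mmdet3d2torchserve import mmdet3d2torchserve  # noqa: E402

mmdet3d2torchserve(
    config_file='configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py',
    checkpoint_file='checkpoints/second_kitti.pth',
    output_folder='model_store',
    model_name='second_kitti',
    force=True,
)
# Equivalent CLI:
#   python tools/deployment/mmdet3d2torchserve.py ${CONFIG} ${CHECKPOINT} \
#       --output-folder model_store --model-name second_kitti -f
```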
+ """ + mmcv.mkdir_or_exist(output_folder) + + config = mmcv.Config.fromfile(config_file) + + with TemporaryDirectory() as tmpdir: + config.dump(f'{tmpdir}/config.py') + + args = Namespace( + **{ + 'model_file': f'{tmpdir}/config.py', + 'serialized_file': checkpoint_file, + 'handler': f'{Path(__file__).parent}/mmdet3d_handler.py', + 'model_name': model_name or Path(checkpoint_file).stem, + 'version': model_version, + 'export_path': output_folder, + 'force': force, + 'requirements_file': None, + 'extra_files': None, + 'runtime': 'python', + 'archive_format': 'default' + }) + manifest = ModelExportUtils.generate_manifest_json(args) + package_model(args, manifest) + + +def parse_args(): + parser = ArgumentParser( + description='Convert MMDetection models to TorchServe `.mar` format.') + parser.add_argument('config', type=str, help='config file path') + parser.add_argument('checkpoint', type=str, help='checkpoint file path') + parser.add_argument( + '--output-folder', + type=str, + required=True, + help='Folder where `{model_name}.mar` will be created.') + parser.add_argument( + '--model-name', + type=str, + default=None, + help='If not None, used for naming the `{model_name}.mar`' + 'file that will be created under `output_folder`.' + 'If None, `{Path(checkpoint_file).stem}` will be used.') + parser.add_argument( + '--model-version', + type=str, + default='1.0', + help='Number used for versioning.') + parser.add_argument( + '-f', + '--force', + action='store_true', + help='overwrite the existing `{model_name}.mar`') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + if package_model is None: + raise ImportError('`torch-model-archiver` is required.' + 'Try: pip install torch-model-archiver') + + mmdet3d2torchserve(args.config, args.checkpoint, args.output_folder, + args.model_name, args.model_version, args.force) diff --git a/tools/deployment/mmdet3d_handler.py b/tools/deployment/mmdet3d_handler.py new file mode 100644 index 0000000000..8b526cdf5f --- /dev/null +++ b/tools/deployment/mmdet3d_handler.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import base64 +import os + +import numpy as np +import torch +from ts.torch_handler.base_handler import BaseHandler + +from mmdet3d.apis import inference_detector, init_model +from mmdet3d.core.points import get_points_type + + +class MMdet3dHandler(BaseHandler): + """MMDetection3D Handler used in TorchServe. + + Handler to load models in MMDetection3D, and it will process data to get + predicted results. For now, it only supports SECOND. + """ + threshold = 0.5 + load_dim = 4 + use_dim = [0, 1, 2, 3] + coord_type = 'LIDAR' + attribute_dims = None + + def initialize(self, context): + """Initialize function loads the model in MMDetection3D. + + Args: + context (context): It is a JSON Object containing information + pertaining to the model artifacts parameters. + """ + properties = context.system_properties + self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu' + self.device = torch.device(self.map_location + ':' + + str(properties.get('gpu_id')) if torch.cuda. 
+ is_available() else self.map_location) + self.manifest = context.manifest + + model_dir = properties.get('model_dir') + serialized_file = self.manifest['model']['serializedFile'] + checkpoint = os.path.join(model_dir, serialized_file) + self.config_file = os.path.join(model_dir, 'config.py') + self.model = init_model(self.config_file, checkpoint, self.device) + self.initialized = True + + def preprocess(self, data): + """Preprocess function converts data into LiDARPoints class. + + Args: + data (List): Input data from the request. + + Returns: + `LiDARPoints` : The preprocess function returns the input + point cloud data as LiDARPoints class. + """ + for row in data: + # Compat layer: normally the envelope should just return the data + # directly, but older versions of Torchserve didn't have envelope. + pts = row.get('data') or row.get('body') + if isinstance(pts, str): + pts = base64.b64decode(pts) + + points = np.frombuffer(pts, dtype=np.float32) + points = points.reshape(-1, self.load_dim) + points = points[:, self.use_dim] + points_class = get_points_type(self.coord_type) + points = points_class( + points, + points_dim=points.shape[-1], + attribute_dims=self.attribute_dims) + + return points + + def inference(self, data): + """Inference Function. + + This function is used to make a prediction call on the + given input request. + + Args: + data (`LiDARPoints`): LiDARPoints class passed to make + the inference request. + + Returns: + List(dict) : The predicted result is returned in this function. + """ + results, _ = inference_detector(self.model, data) + return results + + def postprocess(self, data): + """Postprocess function. + + This function makes use of the output from the inference and + converts it into a torchserve supported response output. + + Args: + data (List[dict]): The data received from the prediction + output of the model. + + Returns: + List: The post process function returns a list of the predicted + output. 
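The handler's `preprocess` expects the request body to be raw float32 point bytes (the same layout as a KITTI `.bin` file). A client-side sketch of producing such a payload and decoding it the way `preprocess` does with `load_dim=4` and `use_dim=[0, 1, 2, 3]`:

```python
import numpy as np

# Fake point cloud: N x 4 float32 (x, y, z, intensity), as in a KITTI .bin file.
points = np.random.rand(8, 4).astype(np.float32)
payload = points.tobytes()            # what would be POSTed to the server

# What the handler does with the body before wrapping it in LiDARPoints:
decoded = np.frombuffer(payload, dtype=np.float32).reshape(-1, 4)[:, [0, 1, 2, 3]]
assert np.allclose(decoded, points)
```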
+ """ + output = [] + for pts_index, result in enumerate(data): + output.append([]) + if 'pts_bbox' in result.keys(): + pred_bboxes = result['pts_bbox']['boxes_3d'].tensor.numpy() + pred_scores = result['pts_bbox']['scores_3d'].numpy() + else: + pred_bboxes = result['boxes_3d'].tensor.numpy() + pred_scores = result['scores_3d'].numpy() + + index = pred_scores > self.threshold + bbox_coords = pred_bboxes[index].tolist() + score = pred_scores[index].tolist() + + output[pts_index].append({'3dbbox': bbox_coords, 'score': score}) + + return output diff --git a/tools/deployment/test_torchserver.py b/tools/deployment/test_torchserver.py new file mode 100644 index 0000000000..613f9e4f7b --- /dev/null +++ b/tools/deployment/test_torchserver.py @@ -0,0 +1,56 @@ +from argparse import ArgumentParser + +import numpy as np +import requests + +from mmdet3d.apis import inference_detector, init_model + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('pcd', help='Point cloud file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('model_name', help='The model name in the server') + parser.add_argument( + '--inference-addr', + default='127.0.0.1:8080', + help='Address and port of the inference server') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.5, help='3d bbox score threshold') + args = parser.parse_args() + return args + + +def parse_result(input): + bbox = input[0]['3dbbox'] + result = np.array(bbox) + return result + + +def main(args): + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint, device=args.device) + # test a single point cloud file + model_result, _ = inference_detector(model, args.pcd) + # filter the 3d bboxes whose scores > 0.5 + if 'pts_bbox' in model_result[0].keys(): + pred_bboxes = model_result[0]['pts_bbox']['boxes_3d'].tensor.numpy() + pred_scores = model_result[0]['pts_bbox']['scores_3d'].numpy() + else: + pred_bboxes = model_result[0]['boxes_3d'].tensor.numpy() + pred_scores = model_result[0]['scores_3d'].numpy() + model_result = pred_bboxes[pred_scores > 0.5] + + url = 'http://' + args.inference_addr + '/predictions/' + args.model_name + with open(args.pcd, 'rb') as points: + response = requests.post(url, points) + server_result = parse_result(response.json()) + assert np.allclose(model_result, server_result) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py index a1782c806f..3c627c0808 100644 --- a/tools/misc/browse_dataset.py +++ b/tools/misc/browse_dataset.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import numpy as np import warnings -from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress from os import path as osp +from pathlib import Path + +import mmcv +import numpy as np +from mmcv import Config, DictAction, mkdir_or_exist from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes) @@ -31,6 +34,10 @@ def parse_args(): type=str, choices=['det', 'seg', 'multi_modality-det', 'mono-det'], help='Determine the visualization method depending on the task.') + parser.add_argument( + '--aug', + action='store_true', + help='Whether to visualize augmented datasets or original dataset.') parser.add_argument( '--online', action='store_true', @@ -50,8 +57,9 @@ def parse_args(): return args -def build_data_cfg(config_path, skip_type, cfg_options): +def build_data_cfg(config_path, skip_type, aug, cfg_options): """Build data config for loading visualization data.""" + cfg = Config.fromfile(config_path) if cfg_options is not None: cfg.merge_from_dict(cfg_options) @@ -63,10 +71,17 @@ def build_data_cfg(config_path, skip_type, cfg_options): if cfg.data.train['type'] == 'ConcatDataset': cfg.data.train = cfg.data.train.datasets[0] train_data_cfg = cfg.data.train - # eval_pipeline purely consists of loading functions - # use eval_pipeline for data loading + + if aug: + show_pipeline = cfg.train_pipeline + else: + show_pipeline = cfg.eval_pipeline + for i in range(len(cfg.train_pipeline)): + if cfg.train_pipeline[i]['type'] == 'LoadAnnotations3D': + show_pipeline.insert(i, cfg.train_pipeline[i]) + train_data_cfg['pipeline'] = [ - x for x in cfg.eval_pipeline if x['type'] not in skip_type + x for x in show_pipeline if x['type'] not in skip_type ] return cfg @@ -83,13 +98,14 @@ def to_depth_mode(points, bboxes): return points, bboxes -def show_det_data(idx, dataset, out_dir, filename, show=False): +def show_det_data(input, out_dir, show=False): """Visualize 3D point cloud and 3D bboxes.""" - example = dataset.prepare_train_data(idx) - points = example['points']._data.numpy() - gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor - if dataset.box_mode_3d != Box3DMode.DEPTH: + img_metas = input['img_metas']._data + points = input['points']._data.numpy() + gt_bboxes = input['gt_bboxes_3d']._data.tensor + if img_metas['box_mode_3d'] != Box3DMode.DEPTH: points, gt_bboxes = to_depth_mode(points, gt_bboxes) + filename = osp.splitext(osp.basename(img_metas['pts_filename']))[0] show_result( points, gt_bboxes.clone(), @@ -100,42 +116,35 @@ def show_det_data(idx, dataset, out_dir, filename, show=False): snapshot=True) -def show_seg_data(idx, dataset, out_dir, filename, show=False): +def show_seg_data(input, out_dir, show=False): """Visualize 3D point cloud and segmentation mask.""" - example = dataset.prepare_train_data(idx) - points = example['points']._data.numpy() - gt_seg = example['pts_semantic_mask']._data.numpy() + img_metas = input['img_metas']._data + points = input['points']._data.numpy() + gt_seg = input['pts_semantic_mask']._data.numpy() + filename = osp.splitext(osp.basename(img_metas['pts_filename']))[0] show_seg_result( points, gt_seg.copy(), None, out_dir, filename, - np.array(dataset.PALETTE), - dataset.ignore_index, + np.array(img_metas['PALETTE']), + img_metas['ignore_index'], show=show, snapshot=True) -def show_proj_bbox_img(idx, - dataset, - out_dir, - filename, - show=False, - is_nus_mono=False): +def show_proj_bbox_img(input, out_dir, show=False, is_nus_mono=False): """Visualize 3D bboxes on 
2D image by projection.""" - try: - example = dataset.prepare_train_data(idx) - except AttributeError: # for Mono-3D datasets - example = dataset.prepare_train_img(idx) - gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] - img_metas = example['img_metas']._data - img = example['img']._data.numpy() + gt_bboxes = input['gt_bboxes_3d']._data + img_metas = input['img_metas']._data + img = input['img']._data.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) # no 3D gt bboxes, just show img if gt_bboxes.tensor.shape[0] == 0: gt_bboxes = None + filename = Path(img_metas['filename']).name if isinstance(gt_bboxes, DepthInstance3DBoxes): show_multi_modality_result( img, @@ -183,53 +192,34 @@ def main(): if args.output_dir is not None: mkdir_or_exist(args.output_dir) - cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options) + cfg = build_data_cfg(args.config, args.skip_type, args.aug, + args.cfg_options) try: dataset = build_dataset( cfg.data.train, default_args=dict(filter_empty_gt=False)) except TypeError: # seg dataset doesn't have `filter_empty_gt` key dataset = build_dataset(cfg.data.train) - data_infos = dataset.data_infos - dataset_type = cfg.dataset_type + dataset_type = cfg.dataset_type # configure visualization mode vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + progress_bar = mmcv.ProgressBar(len(dataset)) - for idx, data_info in enumerate(track_iter_progress(data_infos)): - if dataset_type in ['KittiDataset', 'WaymoDataset']: - data_path = data_info['point_cloud']['velodyne_path'] - elif dataset_type in [ - 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', - 'S3DISSegDataset', 'S3DISDataset' - ]: - data_path = data_info['pts_path'] - elif dataset_type in ['NuScenesDataset', 'LyftDataset']: - data_path = data_info['lidar_path'] - elif dataset_type in ['NuScenesMonoDataset']: - data_path = data_info['file_name'] - else: - raise NotImplementedError( - f'unsupported dataset type {dataset_type}') - - file_name = osp.splitext(osp.basename(data_path))[0] - + for input in dataset: if vis_task in ['det', 'multi_modality-det']: # show 3D bboxes on 3D point clouds - show_det_data( - idx, dataset, args.output_dir, file_name, show=args.online) + show_det_data(input, args.output_dir, show=args.online) if vis_task in ['multi_modality-det', 'mono-det']: # project 3D bboxes to 2D image show_proj_bbox_img( - idx, - dataset, + input, args.output_dir, - file_name, show=args.online, is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) elif vis_task in ['seg']: # show 3D segmentation mask on 3D point clouds - show_seg_data( - idx, dataset, args.output_dir, file_name, show=args.online) + show_seg_data(input, args.output_dir, show=args.online) + progress_bar.update() if __name__ == '__main__': diff --git a/tools/misc/fuse_conv_bn.py b/tools/misc/fuse_conv_bn.py index d4e22018d6..4656e4dfb0 100644 --- a/tools/misc/fuse_conv_bn.py +++ b/tools/misc/fuse_conv_bn.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse + import torch from mmcv.runner import save_checkpoint from torch import nn as nn diff --git a/tools/misc/print_config.py b/tools/misc/print_config.py index 3100fc324b..c3538ef56b 100644 --- a/tools/misc/print_config.py +++ b/tools/misc/print_config.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
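The new `--aug` switch in `browse_dataset.py` controls which pipeline is visualized: the augmented train pipeline, or the eval pipeline with the `LoadAnnotations3D` steps copied in. A condensed sketch of that selection with illustrative pipeline dicts:

```python
def select_show_pipeline(train_pipeline, eval_pipeline, aug, skip_type):
    if aug:
        show_pipeline = list(train_pipeline)
    else:
        show_pipeline = list(eval_pipeline)
        for i, step in enumerate(train_pipeline):
            # annotations are needed for visualization but usually missing
            # from the eval pipeline, so copy those steps over
            if step['type'] == 'LoadAnnotations3D':
                show_pipeline.insert(i, step)
    return [x for x in show_pipeline if x['type'] not in skip_type]


train = [dict(type='LoadPointsFromFile'), dict(type='LoadAnnotations3D'),
         dict(type='GlobalRotScaleTrans'), dict(type='DefaultFormatBundle3D')]
val = [dict(type='LoadPointsFromFile'), dict(type='DefaultFormatBundle3D')]
print(select_show_pipeline(train, val, aug=False,
                           skip_type=['DefaultFormatBundle3D']))
```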
import argparse + from mmcv import Config, DictAction diff --git a/tools/misc/visualize_results.py b/tools/misc/visualize_results.py index 302adc50ec..c59445f6ee 100644 --- a/tools/misc/visualize_results.py +++ b/tools/misc/visualize_results.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse + import mmcv from mmcv import Config diff --git a/tools/model_converters/convert_h3dnet_checkpoints.py b/tools/model_converters/convert_h3dnet_checkpoints.py index 9368a87182..2ede340ae8 100644 --- a/tools/model_converters/convert_h3dnet_checkpoints.py +++ b/tools/model_converters/convert_h3dnet_checkpoints.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import tempfile + import torch from mmcv import Config from mmcv.runner import load_state_dict diff --git a/tools/model_converters/convert_votenet_checkpoints.py b/tools/model_converters/convert_votenet_checkpoints.py index 33792b00dd..7264e319bd 100644 --- a/tools/model_converters/convert_votenet_checkpoints.py +++ b/tools/model_converters/convert_votenet_checkpoints.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import tempfile + import torch from mmcv import Config from mmcv.runner import load_state_dict diff --git a/tools/model_converters/publish_model.py b/tools/model_converters/publish_model.py index 318fd46a65..e2660578af 100644 --- a/tools/model_converters/publish_model.py +++ b/tools/model_converters/publish_model.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import subprocess + import torch diff --git a/tools/model_converters/regnet2mmdet.py b/tools/model_converters/regnet2mmdet.py index 9dee3c878a..fbf8c8f33a 100644 --- a/tools/model_converters/regnet2mmdet.py +++ b/tools/model_converters/regnet2mmdet.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import torch from collections import OrderedDict +import torch + def convert_stem(model_key, model_weight, state_dict, converted_names): new_key = model_key.replace('stem.conv', 'conv1') diff --git a/tools/test.py b/tools/test.py index a2ac25a315..cff933a89e 100644 --- a/tools/test.py +++ b/tools/test.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import mmcv import os -import torch import warnings + +import mmcv +import torch from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel @@ -16,6 +17,13 @@ from mmdet.apis import multi_gpu_test, set_random_seed from mmdet.datasets import replace_ImageToTensor +try: + # If mmdet version > 2.20.0, setup_multi_processes would be imported and + # used from mmdet instead of mmdet3d. + from mmdet.utils import setup_multi_processes +except ImportError: + from mmdet3d.utils import setup_multi_processes + def parse_args(): parser = argparse.ArgumentParser( @@ -129,6 +137,10 @@ def main(): cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) + + # set multi-process settings + setup_multi_processes(cfg) + # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True diff --git a/tools/train.py b/tools/train.py index 2510fdac83..eb0ad4c01f 100644 --- a/tools/train.py +++ b/tools/train.py @@ -1,26 +1,33 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from __future__ import division - import argparse import copy -import mmcv import os import time -import torch import warnings +from os import path as osp + +import mmcv +import torch from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist -from os import path as osp from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version -from mmdet3d.apis import train_model +from mmdet3d.apis import init_random_seed, train_model from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmseg import __version__ as mmseg_version +try: + # If mmdet version > 2.20.0, setup_multi_processes would be imported and + # used from mmdet instead of mmdet3d. + from mmdet.utils import setup_multi_processes +except ImportError: + from mmdet3d.utils import setup_multi_processes + def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') @@ -104,6 +111,9 @@ def main(): if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) + # set multi-process settings + setup_multi_processes(cfg) + # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True @@ -180,12 +190,12 @@ def main(): logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds - if args.seed is not None: - logger.info(f'Set random seed to {args.seed}, ' - f'deterministic: {args.deterministic}') - set_random_seed(args.seed, deterministic=args.deterministic) - cfg.seed = args.seed - meta['seed'] = args.seed + seed = init_random_seed(args.seed) + logger.info(f'Set random seed to {seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(seed, deterministic=args.deterministic) + cfg.seed = seed + meta['seed'] = seed meta['exp_name'] = osp.basename(args.config) model = build_model( diff --git a/tools/update_data_coords.py b/tools/update_data_coords.py new file mode 100644 index 0000000000..94728bcc6f --- /dev/null +++ b/tools/update_data_coords.py @@ -0,0 +1,168 @@ +import argparse +import time +from os import path as osp + +import mmcv +import numpy as np + +from mmdet3d.core.bbox import limit_period + + +def update_sunrgbd_infos(root_dir, out_dir, pkl_files): + print(f'{pkl_files} will be modified because ' + f'of the refactor of the Depth coordinate system.') + if root_dir == out_dir: + print(f'Warning, you are overwriting ' + f'the original data under {root_dir}.') + time.sleep(3) + for pkl_file in pkl_files: + in_path = osp.join(root_dir, pkl_file) + print(f'Reading from input file: {in_path}.') + a = mmcv.load(in_path) + print('Start updating:') + for item in mmcv.track_iter_progress(a): + if 'rotation_y' in item['annos']: + item['annos']['rotation_y'] = -item['annos']['rotation_y'] + item['annos']['gt_boxes_upright_depth'][:, -1:] = \ + -item['annos']['gt_boxes_upright_depth'][:, -1:] + + out_path = osp.join(out_dir, pkl_file) + print(f'Writing to output file: {out_path}.') + mmcv.dump(a, out_path, 'pkl') + + +def update_outdoor_dbinfos(root_dir, out_dir, pkl_files): + print(f'{pkl_files} will be modified because ' + f'of the refactor of the LIDAR coordinate system.') + if root_dir == out_dir: + print(f'Warning, you are overwriting ' + f'the original data under {root_dir}.') + time.sleep(3) + for pkl_file in pkl_files: + in_path = osp.join(root_dir, pkl_file) + print(f'Reading from input file: {in_path}.') + a = mmcv.load(in_path) + print('Start updating:') + for k in a.keys(): + 
print(f'Updating samples of class {k}:') + for item in mmcv.track_iter_progress(a[k]): + boxes = item['box3d_lidar'].copy() + # swap l, w (or dx, dy) + item['box3d_lidar'][3] = boxes[4] + item['box3d_lidar'][4] = boxes[3] + # change yaw + item['box3d_lidar'][6] = -boxes[6] - np.pi / 2 + item['box3d_lidar'][6] = limit_period( + item['box3d_lidar'][6], period=np.pi * 2) + + out_path = osp.join(out_dir, pkl_file) + print(f'Writing to output file: {out_path}.') + mmcv.dump(a, out_path, 'pkl') + + +def update_nuscenes_or_lyft_infos(root_dir, out_dir, pkl_files): + + print(f'{pkl_files} will be modified because ' + f'of the refactor of the LIDAR coordinate system.') + if root_dir == out_dir: + print(f'Warning, you are overwriting ' + f'the original data under {root_dir}.') + time.sleep(3) + for pkl_file in pkl_files: + in_path = osp.join(root_dir, pkl_file) + print(f'Reading from input file: {in_path}.') + a = mmcv.load(in_path) + print('Start updating:') + for item in mmcv.track_iter_progress(a['infos']): + boxes = item['gt_boxes'].copy() + # swap l, w (or dx, dy) + item['gt_boxes'][:, 3] = boxes[:, 4] + item['gt_boxes'][:, 4] = boxes[:, 3] + # change yaw + item['gt_boxes'][:, 6] = -boxes[:, 6] - np.pi / 2 + item['gt_boxes'][:, 6] = limit_period( + item['gt_boxes'][:, 6], period=np.pi * 2) + + out_path = osp.join(out_dir, pkl_file) + print(f'Writing to output file: {out_path}.') + mmcv.dump(a, out_path, 'pkl') + + +parser = argparse.ArgumentParser(description='Arg parser for data coords ' + 'update due to coords sys refactor.') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-dir', + type=str, + default='./data/kitti', + help='specify the root dir of dataset') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--out-dir', + type=str, + default=None, + required=False, + help='name of info pkl') +args = parser.parse_args() + +if __name__ == '__main__': + if args.out_dir is None: + args.out_dir = args.root_dir + if args.dataset == 'kitti': + # KITTI infos is in CAM coord sys (unchanged) + # KITTI dbinfos is in LIDAR coord sys (changed) + # so we only update dbinfos + pkl_files = ['kitti_dbinfos_train.pkl'] + update_outdoor_dbinfos( + root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) + elif args.dataset == 'nuscenes': + # nuScenes infos is in LIDAR coord sys (changed) + # nuScenes dbinfos is in LIDAR coord sys (changed) + # so we update both infos and dbinfos + pkl_files = ['nuscenes_infos_val.pkl'] + if args.version != 'v1.0-mini': + pkl_files.append('nuscenes_infos_train.pkl') + else: + pkl_files.append('nuscenes_infos_train_tiny.pkl') + update_nuscenes_or_lyft_infos( + root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) + if args.version != 'v1.0-mini': + pkl_files = ['nuscenes_dbinfos_train.pkl'] + update_outdoor_dbinfos( + root_dir=args.root_dir, + out_dir=args.out_dir, + pkl_files=pkl_files) + elif args.dataset == 'lyft': + # Lyft infos is in LIDAR coord sys (changed) + # Lyft has no dbinfos + # so we update infos + pkl_files = ['lyft_infos_train.pkl', 'lyft_infos_val.pkl'] + update_nuscenes_or_lyft_infos( + root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) + elif args.dataset == 'waymo': + # Waymo infos is in CAM coord sys (unchanged) + # Waymo dbinfos is in LIDAR coord sys (changed) + # so we only update dbinfos + pkl_files = ['waymo_dbinfos_train.pkl'] + 
update_outdoor_dbinfos( + root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) + elif args.dataset == 'scannet': + # ScanNet infos is in DEPTH coord sys (changed) + # but bbox is without yaw + # so ScanNet is unaffected + pass + elif args.dataset == 's3dis': + # Segmentation datasets are not affected + pass + elif args.dataset == 'sunrgbd': + # SUNRGBD infos is in DEPTH coord sys (changed) + # and bbox is with yaw + # so we update infos + pkl_files = ['sunrgbd_infos_train.pkl', 'sunrgbd_infos_val.pkl'] + update_sunrgbd_infos( + root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) diff --git a/tools/update_data_coords.sh b/tools/update_data_coords.sh new file mode 100644 index 0000000000..bd8db62838 --- /dev/null +++ b/tools/update_data_coords.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -x +export PYTHONPATH=`pwd`:$PYTHONPATH + +PARTITION=$1 +DATASET=$2 +GPUS=${GPUS:-1} +GPUS_PER_NODE=${GPUS_PER_NODE:-1} +SRUN_ARGS=${SRUN_ARGS:-""} +JOB_NAME=update_data_coords + +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/update_data_coords.py ${DATASET} \ + --root-dir ./data/${DATASET} \ + --out-dir ./data/${DATASET}
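The two outdoor helpers in `tools/update_data_coords.py` (`update_outdoor_dbinfos` and `update_nuscenes_or_lyft_infos`) apply the same per-box transformation: swap `dx`/`dy` and remap the yaw into the refactored frame. A vectorized sketch with a made-up box and a local reimplementation of the `limit_period` formula:

```python
import numpy as np


def limit_period(val, offset=0.5, period=np.pi):
    # same formula as mmdet3d.core.bbox.limit_period
    return val - np.floor(val / period + offset) * period


boxes = np.array([[10.0, 2.0, -1.0, 1.9, 4.5, 1.6, 0.3]])   # old layout: (x, y, z, w, l, h, yaw)
updated = boxes.copy()
updated[:, [3, 4]] = boxes[:, [4, 3]]                        # swap dx and dy
updated[:, 6] = limit_period(-boxes[:, 6] - np.pi / 2, period=np.pi * 2)
print(updated)
```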