diff --git a/DATA.md b/DATA.md
index 643b538..b4116b1 100644
--- a/DATA.md
+++ b/DATA.md
@@ -10,6 +10,9 @@ We list the available data used in the current version of CrossOver in the table
| ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- |
| ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ |
| 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ |
+| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ |
+| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ |
+| Structured3D | `[point, rgb, referral]` | `[point, rgb, referral, floorplan]` | ❌ | ❌ |

We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan.

@@ -110,4 +113,96 @@ Scan3R/
| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading)
| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan)
| └── ...
-```
\ No newline at end of file
+```
+### MultiScan
+
+#### Running preprocessing scripts
+Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file):
+
+```bash
+$ bash scripts/preprocess/process_multiscan.sh
+```
+
+Our script for the MultiScan dataset performs the following additional processing:
+
+- 3D-to-2D projection for 2D segmentation, stored as `gt-projection-seg.pt` for each scan.
+
+Post running preprocessing, the data structure should look like the following:
+
+```
+MultiScan/
+├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training)
+|   ├── train_objects.h5
+|   └── val_objects.h5
+├── scans/
+|   ├── scene_00000_00/
+|   │   ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation
+|   │   ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features)
+|   │   ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features)
+|   │   ├── data2D_all_images.pt (RGB features of every image of every scan)
+|   │   ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only)
+|   │   ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped
+|   │   ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading)
+|   │   └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan)
+|   └── ...
+```
+
+### ARKitScenes
+
+#### Running preprocessing scripts
+Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file):
+
+```bash
+$ bash scripts/preprocess/process_arkit.sh
+```
+
+Our script for the ARKitScenes dataset performs the following additional processing:
+
+- 3D-to-2D projection for 2D segmentation, stored as `gt-projection-seg.pt` for each scan.
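+For reference, a minimal sketch of how these per-scan outputs could be inspected, assuming the `.pt` files are standard `torch.save` dictionaries (as with the ScanNet/3RScan preprocessing); the exact keys may vary per dataset, and the scan path below is only an example:
+
+```python
+import os.path as osp
+import torch
+
+# Example scan directory produced by the preprocessing scripts (path is illustrative)
+scan_dir = osp.join('ARKitScenes', 'scans', '40753679')
+
+# Frame-wise 2D instance segmentation from the 3D-to-2D projection step
+gt_proj = torch.load(osp.join(scan_dir, 'gt-projection-seg.pt'), map_location='cpu')
+print(type(gt_proj))
+
+# Combined per-object multimodal data (referrals/BLIP, RGB/DinoV2, point cloud/I2P-MAE features)
+objects_data = torch.load(osp.join(scan_dir, 'objectsDataMultimodal.pt'), map_location='cpu')
+if isinstance(objects_data, dict):
+    print(list(objects_data.keys()))
+```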
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` + + +### Structured3D + +#### Running preprocessing scripts +Adjust the path parameters of `Structured3D` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_structured3d.sh +``` + + +``` +Structured3D/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_490854/ +| │ ├── gt-projection-seg.npz -> framewise 2D instance segmentation +| │ ├── data1D.npz -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.npz -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data3D.npz -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.npz -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.npz -> object data combined from data1D.npz + data2D.npz + data3D.npz (for easier loading) +| └── ... +``` diff --git a/README.md b/README.md index ab1f152..b51cd93 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,10 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| Structured3D | `[point, rgb, referral]` | `[point, rgb, referral, floorplan]` | ❌ | ❌ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -136,7 +140,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). 
+- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index ffa4938..337cdef 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/common/load_utils.py b/common/load_utils.py index cd06a54..e707670 100644 --- a/common/load_utils.py +++ b/common/load_utils.py @@ -8,6 +8,7 @@ import torch import random import numpy as np +import yaml def make_dir(dir_path: str) -> None: """Creates a directory if it does not exist.""" @@ -43,6 +44,12 @@ def load_json(filename: str) -> Any: file.close() return data +def load_yaml(filepath): + with open(filepath) as f: + # file = yaml.load(f, Loader=Loader) + file = yaml.safe_load(f) + return file + def write_json(data_dict: Any, filename: str) -> None: """Writes data to a JSON file with indentation.""" json_obj = json.dumps(data_dict, indent=4) @@ -50,6 +57,20 @@ def write_json(data_dict: Any, filename: str) -> None: with open(filename, "w") as outfile: outfile.write(json_obj) +def load_npz_as_dict(filename: str) -> dict: + with np.load(filename, allow_pickle=True) as npz: + if isinstance(npz, np.lib.npyio.NpzFile): + out = {} + for k in npz.files: + val = npz[k] + if (isinstance(val, np.ndarray) and + val.dtype == object and + val.shape == ()): + out[k] = val.item() + else: + out[k] = val + return out + def get_print_format(value: Any) -> str: """Determines the appropriate format string for a given value.""" if isinstance(value, int): diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..eef264c 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -43,14 +43,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] 
modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth - + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..fc19a4e 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -43,13 +43,32 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..c12deec 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -17,7 +17,7 @@ data: aggre_subfix : _vh_clean.aggregation.json Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -25,6 +25,28 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /media/sayan/internal/datasets/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..bc88740 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml 
@@ -19,7 +19,7 @@ data: skip_frames : 5 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -27,6 +27,29 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /media/sayan/internal/datasets/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + modality_info: 1D : feature_extractor: @@ -60,4 +83,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..76b9744 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -12,18 +12,40 @@ data: layout_dir : /drive/datasets/SceneCAD/ process_dir : ${data.process_dir}/Scannet/ processor3D : Scannet3DProcessor - mesh_subfix : _vh_clean_2.labels.ply + mesh_subfix : _vh_clean_2.ply seg_subfix : _vh_clean_2.0.010000.segs.json aggre_subfix : _vh_clean.aggregation.json Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /media/sayan/internal/datasets/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..ea84c77 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -18,7 +18,7 @@ data: avail_modalities : ['point', 'cad', 'rgb', 'referral'] Scan3R: - base_dir : /drive/datasets/Scan3R + base_dir : /media/sayan/internal/datasets/Scan3R process_dir : ${data.process_dir}/Scan3R chunked_dir : ${data.process_dir}/Scan3R/objects_chunked/ processor3D : Scan3R3DProcessor @@ -28,6 +28,31 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : 
/media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + Structured3D: + base_dir : /media/sayan/internal/datasets/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..bd630d5 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,27 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..b93ab60 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ chunked_dir : ${data.process_dir}/Scan3R/objects_chunked/ processor3D : Scan3R3DProcessor @@ -44,12 +44,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 'referral'] - 
train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..4f75d80 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ chunked_dir : ${data.process_dir}/Scan3R/objects_chunked processor3D : Scan3R3DProcessor @@ -44,14 +44,35 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] - object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/new_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 9a1b744..0e3df30 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + 
csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..c1ed3b3 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -10,8 +10,12 @@ from omegaconf import DictConfig from typing import List, Dict, Any +from common.load_utils import load_npz_as_dict from ..transforms import get_transform from ..data_utils import pad_tensors +from common.load_utils import load_yaml +import albumentations as A + class ScanObjectBase(Dataset): """Base Dataset class for instance level training""" @@ -99,6 +103,14 @@ def __init__(self, data_config: DictConfig, split: str) -> None: self.split = split self.files_dir = osp.join(data_config.base_dir, 'files') + self.color_mean_std_path = osp.join(self.process_dir, 'color_mean_std.yaml') + self.color_mean_std = load_yaml(self.color_mean_std_path) + color_mean, color_std = ( + tuple(self.color_mean_std["mean"]), + tuple(self.color_mean_std["std"]), + ) + self.normalize_color = A.Normalize(mean=color_mean, std=color_std) + self.max_obj_len = data_config.max_object_len self.modalities = data_config.avail_modalities self.voxel_size = data_config.voxel_size @@ -131,16 +143,17 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scan_process_dir = osp.join(self.process_dir, 'scans', scan_id) - scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) - - scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) - scandata_2d = 
torch.load(osp.join(scan_process_dir, 'data2D.pt')) - scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + scan_objects_data = load_npz_as_dict(osp.join(scan_process_dir, 'objectsDataMultimodal.npz')) + scandata_1d = load_npz_as_dict(osp.join(scan_process_dir, 'data1D.npz')) + scandata_2d = load_npz_as_dict(osp.join(scan_process_dir, 'data2D.npz')) + scandata_3d = load_npz_as_dict(osp.join(scan_process_dir, 'data3D.npz')) # Point Cloud Data -- Scene points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] - feats /= 255. - feats -= 0.5 + pseudo_image = feats.astype(np.uint8)[np.newaxis, :, :] + feats = np.squeeze(self.normalize_color(image=pseudo_image)["image"]) + # feats /= 255. + # feats -= 0.5 if scene_label is None: scene_label = 'NA' @@ -152,9 +165,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]: _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) coords, feats = points[sel], feats[sel] - # Get coords, shift to center + # Get coords coords = np.floor(coords / self.voxel_size) - coords-=coords.min(0) + coords -= coords.min(0) # Object Data scene_dict = {} diff --git a/data/datasets/structured3d.py b/data/datasets/structured3d.py new file mode 100644 index 0000000..2b73b41 --- /dev/null +++ b/data/datasets/structured3d.py @@ -0,0 +1,23 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class Structured3DObject(ScanObjectBase): + """Structured3D dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class Structured3D(ScanBase): + """Structured3D dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..5714596 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,9 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes +- MultiScan +- Structured3D ## Prerequisites @@ -16,20 +19,19 @@ Before you begin, simply activate the `crossover` conda environment. #### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). 
- - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0).
+- **3RScan**: Download the 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan).
+
+- **MultiScan**: Download the MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan).
+
+- **ARKitScenes**: Download the ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes).
- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip.
-#### Referral and CAD annotations
-We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet).
+- **Structured3D**: Download the Structured3D dataset from the [official website](https://github.com/bertjiazheng/Structured3D).
-- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/).
-- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset).
+### Download Referral and CAD annotations
+We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup are below.
-### Prepare The Data
-Exact instructions for data setup + preparation below:
#### ScanNet
1. Run the following to extract ScanNet data
@@ -107,3 +109,142 @@ Scan3R/
    └── sceneverse
        └── ssg_ref_rel2_template.json
```
+
+#### ARKitScenes
+1. Download the ARKitScenes 3dod data using the following command:
+
+```bash
+python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES
+```
+The files mentioned in the above command, `download_data.py` and `3dod_train_val_splits.csv`, can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data.
+
+2. Once the data is downloaded, run the following to organize it as per our requirements:
+
+```bash
+cd ARKitScenes
+mv 3dod/Training/* scans
+mv 3dod/Validation/* scans
+```
+
+3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`.
+
+Once completed, the data structure would look like the following:
+```
+ARKitScenes/
+├── scans/
+│   ├── 40753679/
+│   │   ├── 40753679_frames/
+│   │   │   ├── lowres_depth/ (folder containing depth images)
+│   │   │   ├── lowres_wide/ (folder containing rgb images)
+│   │   │   ├── lowres_wide_intrinsics/ (folder containing frame-wise camera intrinsics)
+│   │   │   ├── lowres_wide.traj (camera trajectory)
+│   │   ├── 40753679_3dod_annotation.json
+│   │   ├── 40753679_3dod_mesh.ply
+|   └──
+└── files
+    ├── scannetv2-labels.combined.tsv
+    ├── train_scans.txt
+    ├── val_scans.txt
+    ├── metadata.csv
+    ├── 3dod_train_val_splits.csv
+    └── sceneverse
+        └── ssg_ref_rel2_template.json
+```
+
+#### MultiScan
+1. Download the MultiScan data into `MultiScan/scenes` and run the following to extract it:
+
+```bash
+cd MultiScan/scenes
+unzip '*.zip'
+rm -f *.zip
+```
+2. To generate the sequence of RGB images and corresponding camera poses from the `.mp4` file, run the following:
+```bash
+cd prepare_data/multiscan
+python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval}
+```
+Once completed, the data structure would look like the following:
+```
+MultiScan/
+├── scenes/
+│   ├── scene_00000_00/
+│   │   ├── sequence/ (folder containing rgb images at the specified frame interval)
+|   |   ├── frame_ids.txt
+│   │   ├── scene_00000_00.annotations.json
+│   │   ├── scene_00000_00.jsonl
+│   │   ├── scene_00000_00.confidence.zlib
+│   │   ├── scene_00000_00.mp4
+│   │   ├── poses.jsonl
+│   │   ├── scene_00000_00.ply
+│   │   ├── scene_00000_00.align.json
+│   │   ├── scene_00000_00.json
+|   └──
+└── files
+    ├── scannetv2-labels.combined.tsv
+    ├── train_scans.txt
+    ├── test_scans.txt
+    └── sceneverse
+        └── ssg_ref_rel2_template.json
+```
+
+#### Structured3D
+
+1. Download the Structured3D data (bbox data + perspective_full data for all non-corrupt zips), then run the following scripts after making the path-related changes in each:
+```bash
+bash prepare_data/structured3d/unzip_data.sh
+python prepare_data/structured3d/move_data.py
+```
+This moves all downloaded data into a single folder, Structured3D. After verifying this, move the data into a subdirectory to match the structure our preprocessing expects:
+```bash
+bash prepare_data/structured3d/move2scan.sh
+```
+At this stage, the data should look like this:
+```
+Structured3D/
+├── scans/
+│   ├── scene_00000/
+│   │   ├── 2D_rendering (remember to move perspective instance images here from bbox zip)
+|   |   ├── annotation_3d.json
+│   │   └── bbox_3d.json
+```
+
+2. Next, generate 3D point clouds for each room across all scenes by running the following script:
+```bash
+python3 prepare_data/structured3d/generate_ply.py --base_path PATH_TO_STRUCTURED3D/SCANS
+```
+This will generate a `3D_rendering` directory for each scan, with `room_mesh.ply` in a separate folder for each room.
+
+3. We make use of referrals from SceneVerse, for which we need a mapping of Structured3D object ids to SceneVerse referral target ids. We get this with the help of the following script:
+```bash
+python3 prepare_data/structured3d/uni3dscene.py --base_dir PATH_TO_STRUCTURED3D/SCANS --out_data_root PATH_TO_STRUCTURED3D/uni3d_output --in_data_root PATH_TO_STRUCTURED3D/SCANS
+```
+4. We generate roomwise floorplans for all scenes with the following script:
+```bash
+python3 prepare_data/structured3d/save_floorplan.py --path PATH_TO_STRUCTURED3D/SCANS
+```
+
+The final data organization should look like this:
+```
+Structured3D/
+├── scans/
+│   ├── scene_00000/
+│   │   ├── 2D_rendering (remember to move perspective instance images here from bbox zip)
+|   |   ├── annotation_3d.json
+│   │   ├── bbox_3d.json
+│   │   ├── 3D_rendering
+│   │   └── floorplans
+
+|   └── ...
+└── files + ├── room_types.txt + ├── train_scans.txt + ├── val_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json +└── uni3d_output + ├── annotations + ├── instance + ├── semantic_mask + └── points +``` diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/prepare_data/structured3d/generate_ply.py b/prepare_data/structured3d/generate_ply.py new file mode 100644 index 0000000..19b3cd9 --- /dev/null +++ b/prepare_data/structured3d/generate_ply.py @@ -0,0 +1,366 @@ +import os +import cv2 +import 
numpy as np +import open3d as o3d +from plyfile import PlyData, PlyElement +import json +import argparse +import misc.utils +BASE_PATH = "/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/" + + +def create_color_palette(): + """Returns the NYU40 colormap mapping RGB to class indices.""" + return [ + (0, 0, 0), # Unlabeled (0) + (174, 199, 232), # wall (1) + (152, 223, 138), # floor (2) + (31, 119, 180), # cabinet (3) + (255, 187, 120), # bed (4) + (188, 189, 34), # chair (5) + (140, 86, 75), # sofa (6) + (255, 152, 150), # table (7) + (214, 39, 40), # door (8) + (197, 176, 213), # window (9) + (148, 103, 189), # bookshelf (10) + (196, 156, 148), # picture (11) + (23, 190, 207), # counter (12) + (178, 76, 76), + (247, 182, 210), # desk (14) + (66, 188, 102), + (219, 219, 141), # curtain (16) + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator (25) + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain (28) + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet (33) + (112, 128, 144), # sink (34) + (96, 207, 209), + (227, 119, 194), # bathtub (36) + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn (39) + (100, 85, 144) + ] + +def normalize(vector): + return vector / np.linalg.norm(vector) + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = np.vstack((U, V, W)) + + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + +def point_inside_bbox(point, bbox_corners): + """Check if a point is inside a 3D bounding box defined by its 8 corners.""" + min_coords = np.min(bbox_corners, axis=0) + max_coords = np.max(bbox_corners, axis=0) + + return np.all(min_coords <= point) and np.all(point <= max_coords) + +def load_bounding_boxes(bbox_json_path): + """Load 3D bounding boxes from a JSON file.""" + with open(bbox_json_path, 'r') as f: + bboxes = json.load(f) + return bboxes + +def rgb_to_nyu40id(rgb_image): + """Convert RGB values from `semantic.png` to corresponding NYU40 IDs.""" + palette = create_color_palette() + color_to_id = {color: idx for idx, color in enumerate(palette)} + + h, w, _ = rgb_image.shape + rgb_flatten = rgb_image.reshape(-1, 3) + + # Convert each RGB value to corresponding NYU40 ID + nyu40_ids = np.array([color_to_id.get(tuple(rgb), 0) for rgb in rgb_flatten], dtype=np.int32) + + return nyu40_ids.reshape(h, w) + + +def save_ply_with_labels(filename, pointcloud, object_ids, nyu40_ids): + """Save PLY file with object_id and nyu40id.""" + points = np.asarray(pointcloud.points) + colors = (np.asarray(pointcloud.colors) * 255).astype(np.uint8) if pointcloud.has_colors() else np.zeros_like(points, dtype=np.uint8) + + vertex_data = np.array( + list(zip( + points[:, 0], points[:, 1], points[:, 2], # x, y, z + colors[:, 0], colors[:, 1], colors[:, 2], # red, green, blue + np.full(len(points), 255, dtype=np.uint8), # alpha + object_ids, # Object ID + nyu40_ids # NYU40 Semantic ID + )), + dtype=[ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + 
('object_id', 'i4'), + ('nyu40id', 'i4') + ] + ) + + el = PlyElement.describe(vertex_data, 'vertex') + PlyData([el], text=False).write(filename) + +def process_room(scene_id, room_id, room_path): + """Processes a single room by merging all views and generating a 3D mesh.""" + pcd_list = [] + object_ids_list = [] + nyu40_ids_list = [] + + # Iterate over all views in the room + for view_id in sorted(os.listdir(room_path)): + view_path = os.path.join(room_path, view_id) + + rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") + depth_image_path = os.path.join(view_path, "depth.png") + camera_path = os.path.join(view_path, "camera_pose.txt") + # instance_image_path = os.path.join(view_path, "instance.png") + semantic_image_path = os.path.join(view_path, "semantic.png") + + if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, semantic_image_path]): + print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing files") + continue + + print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + + color = cv2.imread(rgb_image_path) + # cv2.imshow("color", color) + # cv2.waitKey(0) + # color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB) + depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters + # instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image + semantic = cv2.imread(semantic_image_path) # Read as BGR + semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + + nyu40_id_image = rgb_to_nyu40id(semantic) + + valid_mask = depth.flatten() > 0 + # object_ids = instance.flatten()[valid_mask] + nyu40_ids = nyu40_id_image.flatten()[valid_mask] + + height, width = color.shape[:2] + camera_info = np.loadtxt(camera_path) + rot, trans, K = parse_camera_info(camera_info, height, width) + trans = np.array(trans) / 1000 + + + color_o3d = o3d.geometry.Image(color) + depth_o3d = o3d.geometry.Image(depth) + rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False + ) + extrinsic = np.eye(4) + extrinsic[:3, :3] = rot.T + extrinsic[:3, -1] = trans + extrinsic = np.linalg.inv(extrinsic) + + intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) + pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + + pcd_list.append(pointcloud) + # object_ids_list.append(object_ids) + nyu40_ids_list.append(nyu40_ids) + + if not pcd_list: + print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") + return + + pcd_combined = pcd_list[0] + for pcd in pcd_list[1:]: + pcd_combined += pcd + + object_ids_combined = np.array([-1]*len(np.asarray(pcd_combined.points)), dtype=int) # Initialize object IDs + + # Efficient assignment of object IDs based on bounding box inclusion + points = np.asarray(pcd_combined.points) + colors = np.asarray(pcd_combined.colors) + + + bboxes_json_path = os.path.join(BASE_PATH, scene_id, "bbox_3d.json") + bboxes = load_bounding_boxes(bboxes_json_path) + for idx, bbox in enumerate(bboxes): + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + bbox_corners = misc.utils.get_corners_of_bb3d_no_index(basis, coeffs, centroid) # 8 corners of the bounding box + bbox_corners = bbox_corners / 1000 + # Create mask for points inside this bounding box + box_min = np.min(bbox_corners, axis=0, keepdims=True) + 
box_max = np.max(bbox_corners, axis=0, keepdims=True) + # print(min_corner, max_corner) + # print(points) + # mask = np.all((points >= box_min) & (points <= max_corner), axis=1) + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + points_in_bbox = points[point_mask] + # print(points_in_bbox.shape) + # if points_in_bbox.shape[0] != 0: + # print(bbox['ID']) + # colors_in_bbox = colors[mask] + # object_pcd = o3d.geometry.PointCloud() + # object_pcd.points = o3d.utility.Vector3dVector(points_in_bbox) + # object_pcd.colors = o3d.utility.Vector3dVector(colors_in_bbox) + # o3d.visualization.draw_geometries([object_pcd]) + # print(np.all(points>=min_corner, axis=1)) + # Assign object ID to points inside this bounding box + object_ids_combined[point_mask] = bbox['ID'] + # o3d.visualization.draw_geometries([pcd_combined]) + + + nyu40_ids_combined = np.concatenate(nyu40_ids_list) + # print(np.unique(object_ids_combined)) + # Save the mesh file + output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) + os.makedirs(output_dir, exist_ok=True) + ply_filename = os.path.join(output_dir, "room_mesh.ply") + + save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) + print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + + +# if __name__ == '__main__': +# for scene_id in sorted(os.listdir(BASE_PATH)): +# scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") +# if not os.path.isdir(scene_path): +# continue + +# for room_id in sorted(os.listdir(scene_path)): +# room_path = os.path.join(scene_path, room_id, "perspective", "full") +# if os.path.isdir(room_path): + # process_room(scene_id, room_id, room_path) +def parse_args(): + parser = argparse.ArgumentParser(description='Generate PLY files from Structured3D dataset') + parser.add_argument('--base_path', type=str, default="/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/", + help='Base path to the Structured3D dataset') + return parser.parse_args() + +if __name__ == '__main__': + args = parse_args() + BASE_PATH = args.base_path + + for scene_id in sorted(os.listdir(BASE_PATH)): + scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") + if not os.path.isdir(scene_path): + continue + + for room_id in sorted(os.listdir(scene_path)): + room_path = os.path.join(scene_path, room_id, "perspective", "full") + if os.path.isdir(room_path): + process_room(scene_id, room_id, room_path) +# --------------------------------------- +# instance image based object id assignment +# --------------------------------------- + +# def process_room(scene_id, room_id, room_path): +# """Processes a single room by merging all views and generating a 3D mesh.""" +# pcd_list = [] +# object_ids_list = [] +# nyu40_ids_list = [] + +# # Iterate over all views in the room +# for view_id in sorted(os.listdir(room_path)): +# view_path = os.path.join(room_path, view_id) + +# rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") +# depth_image_path = os.path.join(view_path, "depth.png") +# camera_path = os.path.join(view_path, "camera_pose.txt") +# instance_image_path = os.path.join(view_path, "instance.png") +# semantic_image_path = os.path.join(view_path, "semantic.png") + +# if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, instance_image_path, semantic_image_path]): +# print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing 
files") +# continue + +# print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + +# color = cv2.imread(rgb_image_path) +# depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters +# instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image +# semantic = cv2.imread(semantic_image_path) # Read as BGR +# semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + +# nyu40_id_image = rgb_to_nyu40id(semantic) + +# valid_mask = depth.flatten() > 0 +# object_ids = instance.flatten()[valid_mask] +# nyu40_ids = nyu40_id_image.flatten()[valid_mask] + +# height, width = color.shape[:2] +# camera_info = np.loadtxt(camera_path) +# rot, trans, K = parse_camera_info(camera_info, height, width) +# trans = np.array(trans) / 1000 + + +# color_o3d = o3d.geometry.Image(color) +# depth_o3d = o3d.geometry.Image(depth) +# rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( +# color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False +# ) +# extrinsic = np.eye(4) +# extrinsic[:3, :3] = rot.T +# extrinsic[:3, -1] = trans +# extrinsic = np.linalg.inv(extrinsic) + +# intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) +# pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + +# pcd_list.append(pointcloud) +# object_ids_list.append(object_ids) +# nyu40_ids_list.append(nyu40_ids) + +# if not pcd_list: +# print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") +# return + +# pcd_combined = pcd_list[0] +# for pcd in pcd_list[1:]: +# pcd_combined += pcd +# # o3d.visualization.draw_geometries([pcd_combined]) + +# object_ids_combined = np.concatenate(object_ids_list) +# nyu40_ids_combined = np.concatenate(nyu40_ids_list) + +# # Save the mesh file +# output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) +# os.makedirs(output_dir, exist_ok=True) +# ply_filename = os.path.join(output_dir, "room_mesh.ply") + +# save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) +# print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + diff --git a/prepare_data/structured3d/misc/colors.py b/prepare_data/structured3d/misc/colors.py new file mode 100644 index 0000000..191f845 --- /dev/null +++ b/prepare_data/structured3d/misc/colors.py @@ -0,0 +1,47 @@ +semantics_cmap = { + 'living room': '#e6194b', + 'kitchen': '#3cb44b', + 'bedroom': '#ffe119', + 'bathroom': '#0082c8', + 'balcony': '#f58230', + 'corridor': '#911eb4', + 'dining room': '#46f0f0', + 'study': '#f032e6', + 'studio': '#d2f53c', + 'store room': '#fabebe', + 'garden': '#008080', + 'laundry room': '#e6beff', + 'office': '#aa6e28', + 'basement': '#fffac8', + 'garage': '#800000', + 'undefined': '#aaffc3', + 'door': '#808000', + 'window': '#ffd7b4', + 'outwall': '#000000', +} + + +colormap_255 = [ + [230, 25, 75], + [ 60, 180, 75], + [255, 225, 25], + [ 0, 130, 200], + [245, 130, 48], + [145, 30, 180], + [ 70, 240, 240], + [240, 50, 230], + [210, 245, 60], + [250, 190, 190], + [ 0, 128, 128], + [230, 190, 255], + [170, 110, 40], + [255, 250, 200], + [128, 0, 0], + [170, 255, 195], + [128, 128, 0], + [255, 215, 180], + [ 0, 0, 128], + [128, 128, 128], + [255, 255, 255], + [ 0, 0, 0] +] \ No newline at end of file diff --git a/prepare_data/structured3d/misc/figures.py b/prepare_data/structured3d/misc/figures.py new file mode 100644 index 0000000..013acbf --- 
/dev/null +++ b/prepare_data/structured3d/misc/figures.py @@ -0,0 +1,78 @@ +""" +Copy from https://github.com/Toblerity/Shapely/blob/master/docs/code/figures.py +""" + +from math import sqrt +from shapely import affinity + +GM = (sqrt(5)-1.0)/2.0 +W = 8.0 +H = W*GM +SIZE = (W, H) + +BLUE = '#6699cc' +GRAY = '#999999' +DARKGRAY = '#333333' +YELLOW = '#ffcc33' +GREEN = '#339933' +RED = '#ff3333' +BLACK = '#000000' + +COLOR_ISVALID = { + True: BLUE, + False: RED, +} + + +def plot_line(ax, ob, color=GRAY, zorder=1, linewidth=3, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, linewidth=linewidth, solid_capstyle='round', zorder=zorder, alpha=alpha) + + +def plot_coords(ax, ob, color=BLACK, zorder=1, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, zorder=zorder, alpha=alpha) + + +def color_isvalid(ob, valid=BLUE, invalid=RED): + if ob.is_valid: + return valid + else: + return invalid + + +def color_issimple(ob, simple=BLUE, complex=YELLOW): + if ob.is_simple: + return simple + else: + return complex + + +def plot_line_isvalid(ax, ob, **kwargs): + kwargs["color"] = color_isvalid(ob) + plot_line(ax, ob, **kwargs) + + +def plot_line_issimple(ax, ob, **kwargs): + kwargs["color"] = color_issimple(ob) + plot_line(ax, ob, **kwargs) + + +def plot_bounds(ax, ob, zorder=1, alpha=1): + x, y = zip(*list((p.x, p.y) for p in ob.boundary)) + ax.plot(x, y, 'o', color=BLACK, zorder=zorder, alpha=alpha) + + +def add_origin(ax, geom, origin): + x, y = xy = affinity.interpret_origin(geom, origin, 2) + ax.plot(x, y, 'o', color=GRAY, zorder=1) + ax.annotate(str(xy), xy=xy, ha='center', + textcoords='offset points', xytext=(0, 8)) + + +def set_limits(ax, x0, xN, y0, yN): + ax.set_xlim(x0, xN) + ax.set_xticks(range(x0, xN+1)) + ax.set_ylim(y0, yN) + ax.set_yticks(range(y0, yN+1)) + ax.set_aspect("equal") \ No newline at end of file diff --git a/prepare_data/structured3d/misc/panorama.py b/prepare_data/structured3d/misc/panorama.py new file mode 100644 index 0000000..ba2feef --- /dev/null +++ b/prepare_data/structured3d/misc/panorama.py @@ -0,0 +1,243 @@ +""" +Copy from https://github.com/sunset1995/pytorch-layoutnet/blob/master/pano.py +""" +import numpy as np +import numpy.matlib as matlib + + +def xyz_2_coorxy(xs, ys, zs, H=512, W=1024): + us = np.arctan2(xs, ys) + vs = -np.arctan(zs / np.sqrt(xs**2 + ys**2)) + coorx = (us / (2 * np.pi) + 0.5) * W + coory = (vs / np.pi + 0.5) * H + return coorx, coory + + +def coords2uv(coords, width, height): + """ + Image coordinates (xy) to uv + """ + middleX = width / 2 + 0.5 + middleY = height / 2 + 0.5 + uv = np.hstack([ + (coords[:, [0]] - middleX) / width * 2 * np.pi, + -(coords[:, [1]] - middleY) / height * np.pi]) + return uv + + +def uv2xyzN(uv, planeID=1): + ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + xyz = np.zeros((uv.shape[0], 3)) + xyz[:, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[:, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[:, ID3] = np.sin(uv[:, 1]) + return xyz + + +def uv2xyzN_vec(uv, planeID): + """ + vectorization version of uv2xyzN + @uv N x 2 + @planeID N + """ + assert (planeID.astype(int) != planeID).sum() == 0 + planeID = planeID.astype(int) + ID1 = (planeID - 1 + 0) % 3 + ID2 = (planeID - 1 + 1) % 3 + ID3 = (planeID - 1 + 2) % 3 + ID = np.arange(len(uv)) + xyz = np.zeros((len(uv), 3)) + xyz[ID, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[ID, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[ID, ID3] = np.sin(uv[:, 1]) + return xyz + + +def xyz2uvN(xyz, planeID=1): + 
ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + normXY = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2) + normXY[normXY < 0.000001] = 0.000001 + normXYZ = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2 + xyz[:, [ID3]] ** 2) + v = np.arcsin(xyz[:, [ID3]] / normXYZ) + u = np.arcsin(xyz[:, [ID1]] / normXY) + valid = (xyz[:, [ID2]] < 0) & (u >= 0) + u[valid] = np.pi - u[valid] + valid = (xyz[:, [ID2]] < 0) & (u <= 0) + u[valid] = -np.pi - u[valid] + uv = np.hstack([u, v]) + uv[np.isnan(uv[:, 0]), 0] = 0 + return uv + + +def computeUVN(n, in_, planeID): + """ + compute v given u and normal. + """ + if planeID == 2: + n = np.array([n[1], n[2], n[0]]) + elif planeID == 3: + n = np.array([n[2], n[0], n[1]]) + bc = n[0] * np.sin(in_) + n[1] * np.cos(in_) + bs = n[2] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def computeUVN_vec(n, in_, planeID): + """ + vectorization version of computeUVN + @n N x 3 + @in_ MN x 1 + @planeID N + """ + n = n.copy() + if (planeID == 2).sum(): + n[planeID == 2] = np.roll(n[planeID == 2], 2, axis=1) + if (planeID == 3).sum(): + n[planeID == 3] = np.roll(n[planeID == 3], 1, axis=1) + n = np.repeat(n, in_.shape[0] // n.shape[0], axis=0) + assert n.shape[0] == in_.shape[0] + bc = n[:, [0]] * np.sin(in_) + n[:, [1]] * np.cos(in_) + bs = n[:, [2]] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def lineFromTwoPoint(pt1, pt2): + """ + Generate line segment based on two points on panorama + pt1, pt2: two points on panorama + line: + 1~3-th dim: normal of the line + 4-th dim: the projection dimension ID + 5~6-th dim: the u of line segment endpoints in projection plane + """ + numLine = pt1.shape[0] + lines = np.zeros((numLine, 6)) + n = np.cross(pt1, pt2) + n = n / (matlib.repmat(np.sqrt(np.sum(n ** 2, 1, keepdims=True)), 1, 3) + 1e-9) + lines[:, 0:3] = n + + areaXY = np.abs(np.sum(n * matlib.repmat([0, 0, 1], numLine, 1), 1, keepdims=True)) + areaYZ = np.abs(np.sum(n * matlib.repmat([1, 0, 0], numLine, 1), 1, keepdims=True)) + areaZX = np.abs(np.sum(n * matlib.repmat([0, 1, 0], numLine, 1), 1, keepdims=True)) + planeIDs = np.argmax(np.hstack([areaXY, areaYZ, areaZX]), axis=1) + 1 + lines[:, 3] = planeIDs + + for i in range(numLine): + uv = xyz2uvN(np.vstack([pt1[i, :], pt2[i, :]]), lines[i, 3]) + umax = uv[:, 0].max() + np.pi + umin = uv[:, 0].min() + np.pi + if umax - umin > np.pi: + lines[i, 4:6] = np.array([umax, umin]) / 2 / np.pi + else: + lines[i, 4:6] = np.array([umin, umax]) / 2 / np.pi + + return lines + + +def lineIdxFromCors(cor_all, im_w, im_h): + assert len(cor_all) % 2 == 0 + uv = coords2uv(cor_all, im_w, im_h) + xyz = uv2xyzN(uv) + lines = lineFromTwoPoint(xyz[0::2], xyz[1::2]) + num_sample = max(im_h, im_w) + + cs, rs = [], [] + for i in range(lines.shape[0]): + n = lines[i, 0:3] + sid = lines[i, 4] * 2 * np.pi + eid = lines[i, 5] * 2 * np.pi + if eid < sid: + x = np.linspace(sid, eid + 2 * np.pi, num_sample) + x = x % (2 * np.pi) + else: + x = np.linspace(sid, eid, num_sample) + + u = -np.pi + x.reshape(-1, 1) + v = computeUVN(n, u, lines[i, 3]) + xyz = uv2xyzN(np.hstack([u, v]), lines[i, 3]) + uv = xyz2uvN(xyz, 1) + + r = np.minimum(np.floor((uv[:, 0] + np.pi) / (2 * np.pi) * im_w) + 1, + im_w).astype(np.int32) + c = np.minimum(np.floor((np.pi / 2 - uv[:, 1]) / np.pi * im_h) + 1, + im_h).astype(np.int32) + cs.extend(r - 1) + rs.extend(c - 1) + return rs, cs + + +def draw_boundary_from_cor_id(cor_id, img_src): + im_h, im_w = img_src.shape[:2] + cor_all = [cor_id] + for i in 
range(len(cor_id)): + cor_all.append(cor_id[i, :]) + cor_all.append(cor_id[(i+2) % len(cor_id), :]) + cor_all = np.vstack(cor_all) + + rs, cs = lineIdxFromCors(cor_all, im_w, im_h) + rs = np.array(rs) + cs = np.array(cs) + + panoEdgeC = img_src.astype(np.uint8) + for dx, dy in [[-1, 0], [1, 0], [0, 0], [0, 1], [0, -1]]: + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 0] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 1] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 2] = 255 + + return panoEdgeC + + +def coorx2u(x, w=1024): + return ((x + 0.5) / w - 0.5) * 2 * np.pi + + +def coory2v(y, h=512): + return ((y + 0.5) / h - 0.5) * np.pi + + +def u2coorx(u, w=1024): + return (u / (2 * np.pi) + 0.5) * w - 0.5 + + +def v2coory(v, h=512): + return (v / np.pi + 0.5) * h - 0.5 + + +def uv2xy(u, v, z=-50): + c = z / np.tan(v) + x = c * np.cos(u) + y = c * np.sin(u) + return x, y + + +def pano_connect_points(p1, p2, z=-50, w=1024, h=512): + u1 = coorx2u(p1[0], w) + v1 = coory2v(p1[1], h) + u2 = coorx2u(p2[0], w) + v2 = coory2v(p2[1], h) + + x1, y1 = uv2xy(u1, v1, z) + x2, y2 = uv2xy(u2, v2, z) + + if abs(p1[0] - p2[0]) < w / 2: + pstart = np.ceil(min(p1[0], p2[0])) + pend = np.floor(max(p1[0], p2[0])) + else: + pstart = np.ceil(max(p1[0], p2[0])) + pend = np.floor(min(p1[0], p2[0]) + w) + coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) + vx = x2 - x1 + vy = y2 - y1 + us = coorx2u(coorxs, w) + ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) + cs = np.sqrt((x1 + ps * vx) ** 2 + (y1 + ps * vy) ** 2) + vs = np.arctan2(z, cs) + coorys = v2coory(vs) + + return np.stack([coorxs, coorys], axis=-1) \ No newline at end of file diff --git a/prepare_data/structured3d/misc/utils.py b/prepare_data/structured3d/misc/utils.py new file mode 100644 index 0000000..93c63f9 --- /dev/null +++ b/prepare_data/structured3d/misc/utils.py @@ -0,0 +1,138 @@ +""" +Adapted from https://github.com/thusiyuan/cooperative_scene_parsing/blob/master/utils/sunrgbd_utils.py +""" +import numpy as np + + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = -np.cross(W, U) + + rot = np.vstack((U, V, W)) + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + + +def flip_towards_viewer(normals, points): + points = points / np.linalg.norm(points) + proj = points.dot(normals[:2, :].T) + flip = np.where(proj > 0) + normals[flip, :] = -normals[flip, :] + return normals + + +def get_corners_of_bb3d(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + # order the basis + index = np.argsort(np.abs(basis[:, 0]))[::-1] + # the case that two same value appear the same time + if index[2] != 2: + index[1:] = index[1:][::-1] + basis = basis[index, :] + coeffs = coeffs[index] + # Now, we know the basis vectors are orders X, Y, Z. 
Next, flip the basis vectors towards the viewer + basis = flip_towards_viewer(basis, centroid) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def get_corners_of_bb3d_no_index(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def project_3d_points_to_2d(points3d, R_ex, K): + """ + Project 3d points from camera-centered coordinate to 2D image plane + Parameters + ---------- + points3d: numpy array + 3d location of point + R_ex: numpy array + extrinsic camera parameter + K: numpy array + intrinsic camera parameter + Returns + ------- + points2d: numpy array + 2d location of the point + """ + points3d = R_ex.dot(points3d.T).T + x3 = points3d[:, 0] + y3 = -points3d[:, 1] + z3 = np.abs(points3d[:, 2]) + xx = x3 * K[0, 0] / z3 + K[0, 2] + yy = y3 * K[1, 1] / z3 + K[1, 2] + points2d = np.vstack((xx, yy)) + return points2d + + +def project_struct_bdb_to_2d(basis, coeffs, center, R_ex, K): + """ + Project 3d bounding box to 2d bounding box + Parameters + ---------- + basis, coeffs, center, R_ex, K + : K is the intrinsic camera parameter matrix + : Rtilt is the extrinsic camera parameter matrix in right hand coordinates + Returns + ------- + bdb2d: dict + Keys: {'x1', 'x2', 'y1', 'y2'} + The (x1, y1) position is at the top left corner, + the (x2, y2) position is at the bottom right corner + """ + corners3d = get_corners_of_bb3d(basis, coeffs, center) + corners = project_3d_points_to_2d(corners3d, R_ex, K) + bdb2d = dict() + bdb2d['x1'] = int(max(np.min(corners[0, :]), 1)) # x1 + bdb2d['y1'] = int(max(np.min(corners[1, :]), 1)) # y1 + bdb2d['x2'] = int(min(np.max(corners[0, :]), 2*K[0, 2])) # x2 + bdb2d['y2'] = int(min(np.max(corners[1, :]), 2*K[1, 2])) # y2 + # if not check_bdb(bdb2d, 2*K[0, 2], 2*K[1, 2]): + # bdb2d = None + return bdb2d \ No newline at end of file diff --git a/prepare_data/structured3d/move2scan.sh 
b/prepare_data/structured3d/move2scan.sh new file mode 100644 index 0000000..b4d9643 --- /dev/null +++ b/prepare_data/structured3d/move2scan.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +DATA_DIR="/Users/gauravpradeep/CrossOver_ScaleUp/extracted/Structured3D_bbox/Structured3D" #this should be where all the data was moved-basically the structured3d dir within s3d_bbox +TARGET_DIR="/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D" #this should be the final structured3d dir where you want to move the scans +# Define the target subfolder +SCANS_DIR="$TARGET_DIR/scans" + +# Create the scans folder if it doesn't exist +mkdir -p "$SCANS_DIR" + +# Move all files and directories (except "scans" itself) into the scans folder +for item in "$DATA_DIR"/*; do + # Skip the scans directory + if [[ "$(basename "$item")" == "scans" ]]; then + continue + fi + + # Move the item into the scans folder + echo "Moving $item to $SCANS_DIR" + mv "$item" "$SCANS_DIR" +done + +echo "All files and directories have been moved to the 'scans' folder." \ No newline at end of file diff --git a/prepare_data/structured3d/move_annotation.py b/prepare_data/structured3d/move_annotation.py new file mode 100644 index 0000000..21d3b8f --- /dev/null +++ b/prepare_data/structured3d/move_annotation.py @@ -0,0 +1,34 @@ +import os +import shutil + +def move_annotation_files(source_base_folder, target_base_folder): + # Iterate through scenes in the source folder + for scene in os.listdir(source_base_folder): + source_scene_path = os.path.join(source_base_folder, scene) + target_scene_path = os.path.join(target_base_folder, scene) + + # Ensure the scene exists in both source and target + if not os.path.isdir(source_scene_path) or not os.path.isdir(target_scene_path): + continue + + # Check if annotation_3d.json exists in the source scene folder + annotation_file = os.path.join(source_scene_path, "annotation_3d.json") + if os.path.exists(annotation_file): + target_annotation_file = os.path.join(target_scene_path, "annotation_3d.json") + + print(f"Moving {annotation_file} to {target_annotation_file}") + shutil.move(annotation_file, target_annotation_file) + os.remove(annotation_file) + else: + print(f"No annotation_3d.json found in {source_scene_path}. 
Skipping.") + +if __name__ == "__main__": + DIR_PREFIX='/Users/gauravpradeep/CrossOver_ScaleUp/' + # Define the source and target base folder paths + # source_base_folder = "Structured3D-1" # Folder where annotation_3d.json currently exists + target_base_folder = DIR_PREFIX+"Structured3D" # Folder where it should be moved + + for folder in ["Structured3D-1", "Structured3D-2", "Structured3D-3", "Structured3D-4", "Structured3D-5", "Structured3D-6", "Structured3D-7", "Structured3D-8", "Structured3D-9", "Structured3D-11", "Structured3D-12", "Structured3D-13", "Structured3D-14", "Structured3D-15", "Structured3D-16", "Structured3D-17", "Structured3D-18"]: + source_base_folder = DIR_PREFIX+folder + move_annotation_files(source_base_folder, target_base_folder) + diff --git a/prepare_data/structured3d/move_data.py b/prepare_data/structured3d/move_data.py new file mode 100644 index 0000000..e623e78 --- /dev/null +++ b/prepare_data/structured3d/move_data.py @@ -0,0 +1,79 @@ +import os +import shutil +import os.path as osp +def combine_files(in_folder, out_folder): + count = 0 + # Iterate through scenes in the first base folder + for scene in os.listdir(in_folder): + scene_path_in = os.path.join(in_folder, scene) + scene_path_out = os.path.join(out_folder, scene) # Corresponding scene in the second base folder + print(f"Processing scene: {scene_path_in} -> {scene_path_out}") + if not os.path.isdir(scene_path_in) or not os.path.isdir(scene_path_out): + continue + annotation_file_in = os.path.join(scene_path_in, "annotation_3d.json") + annotation_file_out = os.path.join(scene_path_out, "annotation_3d.json") + # print(f"Annotation file in: {annotation_file_in}, Annotation file out: {annotation_file_out}") + shutil.move(annotation_file_in, annotation_file_out) + print(f"Moved {annotation_file_in} to {annotation_file_out}") + + rendering_path_in = os.path.join(scene_path_in, "2D_rendering") + rendering_path_out = os.path.join(scene_path_out, "2D_rendering") + print(f"Rendering path in: {rendering_path_in}, Rendering path out: {rendering_path_out}") + if not os.path.exists(rendering_path_in) or not os.path.exists(rendering_path_out): + continue + + # Iterate through render IDs + for render_id in os.listdir(rendering_path_in): + print("hello") + render_path_in = os.path.join(rendering_path_in, render_id) + render_path_out = os.path.join(rendering_path_out, render_id) + + perspective_path_in = os.path.join(render_path_in, "perspective", "full") + perspective_path_out = os.path.join(render_path_out, "perspective", "full") + print(f"Perspective path in: {perspective_path_in}, Perspective path out: {perspective_path_out}") + + if not os.path.exists(perspective_path_in) or not os.path.exists(perspective_path_out): + continue + print(f"Processing render ID: {render_id} -> {render_path_in} -> {render_path_out}") + # Iterate through views in the perspective folder + for view in os.listdir(perspective_path_in): + view_path_in = os.path.join(perspective_path_in, view) + view_path_out = os.path.join(perspective_path_out, view) + print(f"Processing view: {view_path_in} -> {view_path_out}") + + if not os.path.isdir(view_path_in) or not os.path.isdir(view_path_out): + continue + + # Check if instance.png exists in the target perspective folder + instance_file = os.path.join(view_path_out, "instance.png") + if not os.path.exists(instance_file): + print(f"No instance.png found in {view_path_out}. 
Skipping.") + continue + + # Move all files from the source folder to the target folder + for file_name in os.listdir(view_path_in): + source_file = os.path.join(view_path_in, file_name) + target_file = os.path.join(view_path_out, file_name) + + if os.path.isfile(source_file): + print(f"Moving {source_file} to {target_file}") + shutil.move(source_file, target_file) + + # Optionally: Remove the now-empty view folder + if not os.listdir(view_path_in): + print(f"Removing empty folder: {view_path_in}") + os.rmdir(view_path_in) + count += 1 + print(f"Processed {count} scenes.") + +if __name__ == "__main__": + EXTRACTED_DIR='/Users/gauravpradeep/CrossOver_ScaleUp/extracted' + out_dir = osp.join(EXTRACTED_DIR,"Structured3D_bbox/Structured3D") # this dir is the one that has perspective instance.png and bbox3d.json + print(out_dir) + for folder in os.listdir(EXTRACTED_DIR): + if folder == 'Structured3D_bbox': + continue + in_folder = osp.join(EXTRACTED_DIR, folder, "Structured3D") + print(f"Processing folder: {in_folder}") + combine_files(in_folder, out_dir) + diff --git a/prepare_data/structured3d/save_floorplan.py b/prepare_data/structured3d/save_floorplan.py new file mode 100644 index 0000000..efa2391 --- /dev/null +++ b/prepare_data/structured3d/save_floorplan.py @@ -0,0 +1,170 @@ +import argparse +import json +import os + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import colors +from shapely.geometry import Polygon, Point +from shapely.plotting import plot_polygon + +from misc.colors import semantics_cmap +from misc.utils import get_corners_of_bb3d_no_index + +rooms = [ + "living room", + "kitchen", + "bedroom", + "bathroom", + "balcony", + "corridor", + "dining room", + "study", + "studio", + "store room", + "garden", + "laundry room", + "office", + "basement", + "garage", + "undefined" +] + +def convert_lines_to_vertices(lines): + """convert line representation to polygon vertices + """ + polygons = [] + lines = np.array(lines) + + polygon = None + while len(lines) != 0: + if polygon is None: + polygon = lines[0].tolist() + lines = np.delete(lines, 0, 0) + + lineID, juncID = np.where(lines == polygon[-1]) + vertex = lines[lineID[0], 1 - juncID[0]] + lines = np.delete(lines, lineID, 0) + + if vertex in polygon: + polygons.append(polygon) + polygon = None + else: + polygon.append(vertex) + + return polygons + + +def visualize_floorplan(scene_path): + """visualize floorplan + """ + with open(os.path.join(scene_path, "annotation_3d.json")) as file: + annos = json.load(file) + + with open(os.path.join(scene_path, "bbox_3d.json")) as file: + boxes = json.load(file) + + # extract the floor in each semantic for floorplan visualization + planes = [] + for semantic in annos['semantics']: + for planeID in semantic['planeID']: + if annos['planes'][planeID]['type'] == 'floor': + planes.append({'planeID': planeID, 'type': semantic['type'], 'room_ID': semantic['ID']}) + + if semantic['type'] == 'outwall': + outerwall_planes = semantic['planeID'] + + # extract hole vertices + lines_holes = [] + for semantic in annos['semantics']: + if semantic['type'] in ['window', 'door']: + for planeID in semantic['planeID']: + lines_holes.extend(np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist()) + lines_holes = np.unique(lines_holes) + + # junctions on the floor + junctions = np.array([junc['coordinate'] for junc in annos['junctions']]) + junction_floor = np.where(np.isclose(junctions[:, -1], 0))[0] + + # construct each polygon + polygons = [] + for plane in planes: + lineIDs = 
np.where(np.array(annos['planeLineMatrix'][plane['planeID']]))[0].tolist() + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + polygon = convert_lines_to_vertices(junction_pairs) + polygons.append([polygon[0], plane['type'], plane['room_ID']]) + + outerwall_floor = [] + for planeID in outerwall_planes: + lineIDs = np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist() + lineIDs = np.setdiff1d(lineIDs, lines_holes) + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + for start, end in junction_pairs: + if start in junction_floor and end in junction_floor: + outerwall_floor.append([start, end]) + + outerwall_polygon = convert_lines_to_vertices(outerwall_floor) + polygons.append([outerwall_polygon[0], 'outwall', 0]) + + junctions = np.array([junc['coordinate'][:2] for junc in annos['junctions']]) + + room_polygons = {} + for (polygon, poly_type, room_id) in polygons: + if poly_type in rooms: + if poly_type not in room_polygons: + room_polygons[room_id] = [] + room_polygons[room_id].append(polygon) + + floorplans_dir = os.path.join(scene_path, 'floorplans') + os.makedirs(floorplans_dir, exist_ok=True) + + for room_id, room_polys in room_polygons.items(): + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + room_polygon_objects = [] + for polygon in room_polys: + polygon = np.array(polygon + [polygon[0], ]) + polygon = Polygon(junctions[polygon]) + room_polygon_objects.append(polygon) + room_type = next((item['type'] for item in annos['semantics'] if item['ID'] == room_id)) + plot_polygon(polygon, ax=ax, add_points=False, facecolor=semantics_cmap[room_type], alpha=0.5) + + for bbox in boxes: + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + + corners = get_corners_of_bb3d_no_index(basis, coeffs, centroid) + corners = corners[[0, 1, 2, 3, 0], :2] + + bbox_polygon = Polygon(corners) + for room_polygon in room_polygon_objects: + if room_polygon.contains(Point(centroid[:2])): + plot_polygon(bbox_polygon, ax=ax, add_points=False, facecolor=colors.rgb2hex(np.random.rand(3)), alpha=0.5) + + + plt.axis('equal') + plt.axis('off') + output_file = os.path.join(floorplans_dir, f"{room_id}.png") + plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight', pad_inches=0) + plt.close(fig) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Structured3D Floorplan Visualization") + parser.add_argument("--path", required=True, + help="dataset path", metavar="DIR") + return parser.parse_args() + + +def main(): + args = parse_args() + scenes = [d for d in os.listdir(args.path) if os.path.isdir(os.path.join(args.path, d)) and d.startswith('scene_')] + for scene in scenes: + scene_path = os.path.join(args.path, scene) + visualize_floorplan(scene_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/prepare_data/structured3d/uni3dscene.py b/prepare_data/structured3d/uni3dscene.py new file mode 100644 index 0000000..e1c6ec7 --- /dev/null +++ b/prepare_data/structured3d/uni3dscene.py @@ -0,0 +1,417 @@ +# pylint: disable=no-member +import os +import io +import json +import pickle +from typing import List, Tuple, Dict +import multiprocessing +import cv2 +import numpy as np +from PIL import Image as pil_image + +from utils.config import ProcessUnit, EnvsConfig +from utils.nyu_40 import NYU40 +from utils.s3dutilize import S3DUtilize, Annotations +from 
utils.base_dataset import DatasetBase +import argparse + +BASE_DIR = '/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans' +class Structured3DDataGen(DatasetBase): + """ + Dataset generation for Structured3D. + + Two separated folders will be created in target folder -- points and semantic_mask. + Points will be saved a .bin file with raw shape [N, 6] (3 for XYZ, 3 for RGB) + and data type np.float32. Semantic mask will be saved a .bin file with raw shape + [N] and data type np.int64. + """ + IMAGE_PREFIX = '/2D_rendering' + + PERSPECTIVE_PREFIX = '/perspective/full' + PRSP_CAM_FILE = 'camera_pose.txt' + + PANORAMIC_PREFIX = '/panorama/full' + PANO_CAM_PREFIX = '/panorama' + PANO_CAM_FILE = 'camera_xyz.txt' + + SEMANTIC_FILE = 'semantic.png' + RGB_FILE = 'rgb_rawlight.png' + DEPTH_FILE = 'depth.png' + + ANNO_FILE = 'bbox_3d.json' + + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._zip_folder = BASE_DIR + + def _get_rooms_list_by_types(self, room_types: List[str]) -> List[str]: + assert len(room_types) == 1 and 'all' in room_types + scenes_list = [d for d in os.listdir(self._zip_folder) if os.path.isdir(os.path.join(self._zip_folder, d))] + rooms_list = list() + for scene_path in scenes_list: + rooms_name = os.listdir(os.path.join(self._zip_folder, scene_path, __class__.IMAGE_PREFIX.strip('/'))) + rooms_list.extend([os.path.join(scene_path, __class__.IMAGE_PREFIX.strip('/'), _r) for _r in rooms_name]) + return rooms_list + + @staticmethod + def read_camera_and_image(cam_path: str, info_flags: int, info_root: str) -> Tuple[List, List[np.ndarray]]: + """ + Read camera poses and images from the file system + + Args: + cam_path (str): the relative path of camera + info_flags (int): the flag of the type of images to be read + + Returns: + Tuple[List, List[np.ndarray]]: Camera information and a list of images + """ + if info_root is None: + info_root = cam_path[:cam_path.rfind('/')] + + out_cams = list() + if info_flags & 1: + # Load camera poses + z2y_top_m = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=np.float32) + with open(cam_path, 'r') as f: + cam_extr = np.fromstring(f.read(), dtype=np.float32, sep=' ') + cam_t = np.matmul(z2y_top_m, cam_extr[:3] / 1000) + if cam_extr.shape[0] > 3: + cam_r = S3DUtilize.get_rotation_matrix_from_tu(cam_extr[3:6], cam_extr[6:9]) + cam_r = np.matmul(z2y_top_m, cam_r) + cam_hf = cam_extr[9:11] + else: + cam_r = np.eye(3, dtype=np.float32) + cam_hf = None + out_cams.extend([cam_r, cam_t, cam_hf]) + out_images = list() + if info_flags & 2: + # Load depth image + depth_image = cv2.imread(os.path.join(info_root, __class__.DEPTH_FILE), cv2.IMREAD_UNCHANGED)[..., np.newaxis] + depth_image[depth_image == 0] = 65535 + out_images.append(depth_image) + if info_flags & 4: + # Load RGB image + color_image = cv2.imread(os.path.join(info_root, __class__.RGB_FILE), cv2.IMREAD_UNCHANGED)[..., :3][..., ::-1] + out_images.append(color_image) + if info_flags & 8: + # Load semantic image + smnt_image = np.array(pil_image.open(os.path.join(info_root, __class__.SEMANTIC_FILE)))[..., np.newaxis] + out_images.append(smnt_image) + return out_cams, out_images + + @staticmethod + def normal_from_cross_product(points_2d: np.ndarray) -> np.ndarray: + xyz_points_pad = np.pad(points_2d, ((0, 1), (0, 1), (0, 0)), mode='symmetric') + xyz_points_ver = (xyz_points_pad[:, :-1, :] - xyz_points_pad[:, 1:, :])[:-1, :, :] + xyz_points_hor = (xyz_points_pad[:-1, :, :] - xyz_points_pad[1:, :, :])[:, :-1, :] + 
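+        # Per-pixel surface normals: cross product of the horizontal and vertical
+        # finite differences of the back-projected points; degenerate (zero-length)
+        # normals are left as zeros by the guarded divide below.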
xyz_normal = np.cross(xyz_points_hor, xyz_points_ver) + xyz_dist = np.linalg.norm(xyz_normal, axis=-1, keepdims=True) + xyz_normal = np.divide(xyz_normal, xyz_dist, out=np.zeros_like(xyz_normal), where=xyz_dist != 0) + return xyz_normal + + @staticmethod + def view2points_prsp(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single perspective image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images (List[np.ndarray]): a list of images to be casted + cos_thrsh (float, optional): the cosine threshold to filtering interpolated depth. Defaults to 0.15. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + cam_r, cam_t, cam_hf = cam_paras + img_size = np.asarray(depth_img.shape[:2])[::-1] + cam_focal = img_size / 2 / np.tan(cam_hf) + cam_fov_d = S3DUtilize.get_fov_normal(img_size, cam_focal).astype(np.float32) + v_points = S3DUtilize.cast_perspective_to_local_coord(depth_img, cam_fov_d) + vi_normals = __class__.normal_from_cross_product(v_points) + + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(v_points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((v_points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + depth_valid = depth_img < 65535 + smnt_valid = smnt_img > 0 + all_valid = (point_valid & depth_valid & smnt_valid)[..., 0] + + v_points = np.matmul(v_points / 1000, cam_r.T) + cam_t + v_normal = __class__.normal_from_cross_product(v_points) + + return v_points[all_valid], color_img[all_valid], v_normal[all_valid], smnt_img[all_valid] + + @staticmethod + def view2points_pano(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single panoramic image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images (List[np.ndarray]): a list of images to be casted + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + _, cam_t, _ = cam_paras + p_h, p_w = attr_images[0].shape[:2] + p_a = np.arange(p_w, dtype=np.float32) / p_w * 2 * np.pi - np.pi + p_b = np.arange(p_h, dtype=np.float32) / p_h * np.pi * -1 + np.pi/2 + p_a = np.tile(p_a[None], [p_h, 1])[..., np.newaxis] + p_b = np.tile(p_b[:, None], [1, p_w])[..., np.newaxis] + p_a_sin, p_a_cos, p_b_sin, p_b_cos = np.sin(p_a), np.cos(p_a), np.sin(p_b), np.cos(p_b) + point_x = depth_img * p_a_cos * p_b_cos + point_y = depth_img * p_b_sin + point_z = depth_img * p_a_sin * p_b_cos + points = np.concatenate([point_x, point_y, point_z], axis=-1) / 1000 + vi_normals = __class__.normal_from_cross_product(points) + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + all_valid = (point_valid & (depth_img < 65535) & (smnt_img > 0))[..., 0] + + points = points + cam_t + + return points[all_valid], color_img[all_valid], vi_normals[all_valid], smnt_img[all_valid] + + @staticmethod + def _points2voxel(attr_points: List[np.ndarray], res=0.005) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + p_points, p_colors, p_labels, p_normals = attr_points + + try: + vd_points = np.floor(p_points / res).astype(np.int64) + vd_max = np.max(vd_points, axis=0) + 
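+            # The lines below flatten each integer voxel coordinate into a single
+            # linear index; np.unique on that index then keeps one point per
+            # occupied `res`-sized voxel (simple voxel-grid downsampling).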
vd_min = np.min(vd_points, axis=0) + vd_box = np.cumprod([1, *(vd_max - vd_min)[:2]]) + + vd_indices = np.sum((vd_points - vd_min[np.newaxis, ...]) * vd_box[np.newaxis, ...], axis=-1) + _, vd_uni = np.unique(vd_indices, return_index=True) + except ValueError: + return None, None, None, None + + return p_points[vd_uni], p_colors[vd_uni], p_labels[vd_uni], p_normals[vd_uni] + + @staticmethod + def _view2points(room_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + prsp_root = os.path.join(room_path, __class__.PERSPECTIVE_PREFIX.strip('/')) + cam_paths = [os.path.join(prsp_root, f) for f in os.listdir(os.path.join(BASE_DIR,prsp_root)) if f.endswith(__class__.PRSP_CAM_FILE)] + all_infos = list() + for cam_path in cam_paths: + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, None) + r_points, r_colors, r_normal, r_labels = __class__.view2points_prsp(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + pano_cam_root = os.path.join(room_path, __class__.PANO_CAM_PREFIX.strip('/')) + cam_paths = [os.path.join(BASE_DIR, pano_cam_root, f) for f in os.listdir(os.path.join(BASE_DIR, pano_cam_root)) if f.endswith(__class__.PANO_CAM_FILE)] + for cam_path in cam_paths: + pano_root = os.path.dirname(os.path.dirname(cam_path)) + pano_root = os.path.join(pano_root, __class__.PANORAMIC_PREFIX.strip('/')) + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, pano_root) + r_points, r_colors, r_normal, r_labels = __class__.view2points_pano(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + a_points = np.concatenate([_i[0] for _i in all_infos], axis=0) + a_colors = np.concatenate([_i[1] for _i in all_infos], axis=0) + a_normals = np.concatenate([_i[2] for _i in all_infos], axis=0) + a_labels = np.concatenate([_i[3] for _i in all_infos], axis=0) + + a_points = a_points[..., [2, 0, 1]] # Convert Y-top to Z-top + a_normals = a_normals[..., [2, 0, 1]] + # print(len(a_points), len(a_colors), len(a_labels), len(a_normals)) + return a_points, a_colors, a_labels, a_normals + + @staticmethod + def _read_instance_infos(room_path: str, points: np.ndarray, labels: np.ndarray, min_pts=50) -> Dict: + scene_id, _, _ = room_path.split('/') + anno_file = os.path.join(BASE_DIR,scene_id, __class__.ANNO_FILE) + if not os.path.exists(anno_file): + return None + + with open(anno_file, 'r') as f: + boxes_info: List[Dict] = json.load(f) + + anno_infos = Annotations() + rb_idx = 0 # room bounding box ID + obj2tgid={} + for box_info in boxes_info: + + b_id = int(box_info['ID']) + centroid = np.asarray(box_info['centroid'], dtype=np.float32) / 1000 + coeffs = np.asarray(box_info['coeffs'], dtype=np.float32) / 1000 + basis = np.asarray(box_info['basis'], dtype=np.float32) + obb_8pts = S3DUtilize.get_8points_bounding_box(basis, coeffs, centroid) + + box_min = np.min(obb_8pts, axis=0, keepdims=True) + box_max = np.max(obb_8pts, axis=0, keepdims=True) + + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + box_points: np.ndarray = points[point_mask] + if box_points.size < min_pts: + continue + + box_instances = labels[point_mask][..., 0] + instance_id, instance_count = np.unique(box_instances, return_counts=True) + instance_id = instance_id[np.argmax(instance_count)] + + instance_points = box_points[box_instances == instance_id] + ip_box_min = np.min(instance_points, axis=0) + ip_box_max = 
np.max(instance_points, axis=0) + dimension = np.maximum(centroid - ip_box_min, ip_box_max - centroid) + + ur_depth = np.concatenate([centroid, dimension * 2], axis=0) + + anno_infos.index.append(rb_idx) + anno_infos.classes.append(instance_id) + anno_infos.name.append(NYU40.index_to_label(instance_id)) + anno_infos.location.append(centroid) + anno_infos.dimensions.append(dimension) + anno_infos.gt_boxes_upright_depth.append(ur_depth) + anno_infos.unaligned_location.append(centroid) + anno_infos.unaligned_dimensions.append(dimension) + anno_infos.unaligned_gt_boxes_upright_depth.append(ur_depth) + obj2tgid[b_id] = rb_idx + rb_idx += 1 + + obj2tgid_path = os.path.join(BASE_DIR, room_path, "obj2tgid.json") + with open(obj2tgid_path, 'w') as json_file: + json.dump(obj2tgid, json_file) + + anno_infos.gt_num = rb_idx + anno_infos.axis_align_matrix = np.eye(4, dtype=np.float64) + return anno_infos.dump() + + def _mp_format_dataset(self, rooms_list: List[str], proc_unit: ProcessUnit, start_index=0, worker_id=0): + del start_index, worker_id + + points_folder = self.envs.get_env_path(proc_unit.out_paths[0]) + os.makedirs(points_folder, exist_ok=True) + semantics_folder = self.envs.get_env_path(proc_unit.out_paths[1]) + os.makedirs(semantics_folder, exist_ok=True) + instance_folder = self.envs.get_env_path(proc_unit.out_paths[2]) + os.makedirs(instance_folder, exist_ok=True) + annotation_folder = self.envs.get_env_path(proc_unit.out_paths[3]) + os.makedirs(annotation_folder, exist_ok=True) + + for _, room_path in enumerate(rooms_list): + if '.' in room_path: + continue + scene_id, _, room_id = room_path.split('/') + dump_name = f'{scene_id}_{room_id}_1cm.bin' + points_path = os.path.join(points_folder, dump_name) + semantics_path = os.path.join(semantics_folder, dump_name) + instance_path = os.path.join(instance_folder, dump_name) + annotation_path = os.path.join(annotation_folder, dump_name) + if np.all([os.path.exists(_path) for _path in [points_path, semantics_path, annotation_path]]): + continue + + # Step 1: Read images and make point clouds + + a_points, a_colors, a_labels, a_normals = self._view2points(room_path) + v_points, v_colors, v_labels, v_normals = self._points2voxel((a_points, a_colors, a_labels, a_normals), 0.01) + if v_points is None: + print(f'Ignore {room_path} with invalid points') + continue + # Step 2: Read bounding box information + anno_infos = self._read_instance_infos(room_path, v_points, v_labels) + if anno_infos is None: + print(f'Ignore {room_path} with invalid annotations') + continue + # print(v_points.shape) + # print(v_colors.shape) + # print(v_labels.shape) + # print(v_normals.shape) + np.concatenate([v_points.astype(np.float32), v_colors.astype(np.float32), v_normals.astype(np.float32)], axis=-1).tofile(points_path) + v_labels.astype(np.int64).tofile(semantics_path) + with open(annotation_path, 'wb') as a_fp: + pickle.dump(anno_infos, a_fp) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def format_dataset(self, proc_unit: ProcessUnit): + attrs = proc_unit.attrs + + desc_dir = os.path.join(self.envs.out_data_root, 'desc') + 
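+        # The empty (0, 9) float32 and (0,) int64 arrays pickled into `desc/` below
+        # presumably act as layout descriptors for the per-room .bin dumps
+        # (XYZ + RGB + normal = 9 float32 channels; int64 semantic labels).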
os.makedirs(desc_dir, exist_ok=True) + with open(os.path.join(desc_dir, proc_unit.out_paths[0]), 'wb') as b_fp: + pickle.dump(np.zeros([0, 9], np.float32), b_fp) + with open(os.path.join(desc_dir, proc_unit.out_paths[1]), 'wb') as b_fp: + pickle.dump(np.zeros([0], np.int64), b_fp) + + rooms_list = self._get_rooms_list_by_types(attrs['room_types']) + + __class__.multiple_processor(self._mp_format_dataset, rooms_list, 8, \ + (proc_unit, )) +# def main(): +# # Create the environment configuration instance +# envs = EnvsConfig() +# envs.out_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output" +# envs.in_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans" +# # Add other necessary environment variables here + +# # Define the process unit +# proc_unit = ProcessUnit() +# proc_unit.in_paths = ["data"] +# proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] +# proc_unit.attrs = {"room_types": ["all"]} + +# # Create the Structured3DDataGen instance +# data_gen = Structured3DDataGen([proc_unit], envs) + +# # Run the dataset formatting +# data_gen.format_dataset(proc_unit) + +# if __name__ == "__main__": +# main() + +def parse_args(): + parser = argparse.ArgumentParser(description='Process Structured3D dataset') + parser.add_argument('--base_dir', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Base directory for scans') + parser.add_argument('--out_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output', + help='Output data root directory') + parser.add_argument('--in_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Input data root directory') + return parser.parse_args() + +def main(): + args = parse_args() + global BASE_DIR + BASE_DIR = args.base_dir + + envs = EnvsConfig() + envs.out_data_root = args.out_data_root + envs.in_data_root = args.in_data_root + + proc_unit = ProcessUnit() + proc_unit.in_paths = ["data"] + proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] + proc_unit.attrs = {"room_types": ["all"]} + + data_gen = Structured3DDataGen([proc_unit], envs) + + data_gen.format_dataset(proc_unit) + +if __name__ == "__main__": + main() diff --git a/prepare_data/structured3d/unzip_data.sh b/prepare_data/structured3d/unzip_data.sh new file mode 100644 index 0000000..73e53ba --- /dev/null +++ b/prepare_data/structured3d/unzip_data.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Directory containing the zip files; update this as needed. +S3D_DOWNLOAD_DIR="/Users/gauravpradeep/CrossOver_ScaleUp" + +# Find and sort all Structured3D_*.zip files in the directory +cd "$S3D_DOWNLOAD_DIR" || exit 1 + +for zip_file in Structured3D_*.zip; do + # Skip if no files match + [ -e "$zip_file" ] || continue + extract_dir="${zip_file%.zip}" + echo "Extracting $zip_file..." + mkdir -p "$extract_dir" + unzip -q "$zip_file" -d "$extract_dir" +done + +echo "Done extracting all zips." \ No newline at end of file diff --git a/prepare_data/structured3d/utils/base_dataset.py b/prepare_data/structured3d/utils/base_dataset.py new file mode 100644 index 0000000..c59fc7e --- /dev/null +++ b/prepare_data/structured3d/utils/base_dataset.py @@ -0,0 +1,18 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
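+# Defines DatasetBase, the abstract entry class which concrete dataset
+# generators (e.g. Structured3DDataGen) subclass, implementing format_dataset().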
+""" +from abc import abstractmethod + +from utils.config import ProcessUnit, EntryBase + + +class DatasetBase(EntryBase): + """ + The base class of dataset + """ + @abstractmethod + def format_dataset(self, proc_unit: ProcessUnit): + """ + Construct 3D point cloud from views + """ \ No newline at end of file diff --git a/prepare_data/structured3d/utils/config.py b/prepare_data/structured3d/utils/config.py new file mode 100644 index 0000000..b5f89db --- /dev/null +++ b/prepare_data/structured3d/utils/config.py @@ -0,0 +1,270 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +# pylint: disable=logging-fstring-interpolation +import logging +import re +import multiprocessing as mp +from abc import abstractmethod +from typing import List, Dict, Tuple +import multiprocessing +import yaml +import json +import copy +import inspect +import logging +import numpy as np +import os + +class DictRecursive(object): + def __init__(self): + pass + + def load(self, kargs: dict, shared_dict=None): + """ + Launch args of class from a dict. All subclass of DictRecursive will call this function automatically. Supported + types includes int, float, list, str and DictRecursive + + Args: + kargs: a dict saved the pairs of name/value of attributions + shared_dict: a shared item used by all other items + """ + if shared_dict is None: + shared_dict = {} + for cls_arg_name in self.__dict__.keys(): + arg_value = None + if kargs is not None: + arg_value = kargs[cls_arg_name] if cls_arg_name in kargs.keys() else None + if shared_dict is not None: + arg_value = shared_dict[cls_arg_name] if cls_arg_name in shared_dict.keys() else arg_value + cls_arg = self.__dict__[cls_arg_name] + self.__dict__[cls_arg_name] = self.parse_single_arg(cls_arg, arg_value, shared_dict) + return self + + def save(self): + save_dict = {} + for cls_arg_name in self.__dict__.keys(): + save_dict[cls_arg_name] = self.inverse_single_arg(self.__dict__[cls_arg_name]) + return save_dict + + def load_from_yaml(self, yaml_path, shared_scope=''): + with open(yaml_path, 'r', encoding='utf-8') as fp: + cfg_cxt = yaml.load(fp.read(), Loader=yaml.FullLoader) + self.load(cfg_cxt, cfg_cxt[shared_scope] if shared_scope in cfg_cxt.keys() else dict()) + + def load_from_json(self, json_path): + with open(json_path, 'r', encoding='utf-8') as fp: + self.load(json.load(fp)) + + def save_to_json(self, json_path): + with open(json_path, 'w') as fp: + save_meta = self.save() + json.dump(self.save(), fp) + + @staticmethod + def inverse_single_arg(arg_value): + if issubclass(type(arg_value), DictRecursive): + return arg_value.save() + elif isinstance(arg_value, list): + list_arg_value = list() + for a_v in arg_value: + list_arg_value.append(DictRecursive.inverse_single_arg(a_v)) + return list_arg_value + elif isinstance(arg_value, np.ndarray): + return arg_value.tolist() + else: + return arg_value + + @staticmethod + def parse_single_arg(cls_arg, arg_value, shared_dict=None): + if isinstance(cls_arg, int): + cls_arg_value = int(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, str): + cls_arg_value = str(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, float): + cls_arg_value = float(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, list): + cls_arg_value = list() + cls_arg_e = str() if not cls_arg else cls_arg[0] + if arg_value is not None: + for a_v in arg_value: + cls_arg_value.append(DictRecursive.parse_single_arg(cls_arg_e, a_v, shared_dict)) + elif 
isinstance(cls_arg, dict): + if arg_value is not None: + cls_arg_value = dict() + for a_v in arg_value: + cls_arg_value[a_v] = arg_value[a_v] + else: + cls_arg_value = cls_arg + elif isinstance(cls_arg, np.ndarray): + if arg_value is not None: + cls_arg_value = np.asarray(arg_value, cls_arg.dtype) + else: + cls_arg_value = cls_arg + elif issubclass(type(cls_arg), DictRecursive): + cls_arg_value = type(cls_arg)() + cls_arg_value.load(arg_value, shared_dict) + else: + raise NotImplementedError + return cls_arg_value + + def match_function_args(self, external_dict, target_func): + args_dict = copy.deepcopy(external_dict) + for func_key in inspect.signature(target_func).parameters.keys(): + if func_key not in self.__dict__.keys(): + continue + if func_key in args_dict.keys(): + continue + args_dict[func_key] = self.__dict__[func_key] + return args_dict + + +class ProcessUnit(DictRecursive): + """ + Pipeline units + """ + def __init__(self): + super().__init__() + self.assemble_function = str() + self.name = str() + self.stride = 1 + self.attrs = dict() + self.in_paths = list() + self.out_paths = list() + + +class EntryConfig(DictRecursive): + """ + Main entry of each task + """ + def __init__(self): + super().__init__() + self.assemble_class = str() + self.process_pipelines = list([ProcessUnit()]) + + +class EnvsConfig(DictRecursive): + """ + Global environments + """ + def __init__(self): + super().__init__() + self.in_data_root = str() + self.out_data_root = str() + self.io_paths: Dict[str, str] = dict() + + def get_env_path(self, env_name: str): + """ + Get the absolute folder path by the env name + """ + if 'in_data_root' not in self.io_paths: + self.io_paths['in_data_root'] = self.in_data_root + self.io_paths['out_data_root'] = self.out_data_root + self.io_paths = { + "points": os.path.join(self.out_data_root, "points"), + "semantic_mask": os.path.join(self.out_data_root, "semantic_mask"), + "instance": os.path.join(self.out_data_root, "instance"), + "annotations": os.path.join(self.out_data_root, "annotations"), + } + rel_path = self.io_paths[env_name] + while True: + regex_pattern = r'\$.*\$' + patterns = re.findall(regex_pattern, rel_path) + if not patterns: + break + rel_path = rel_path.replace(patterns[0], self.io_paths[patterns[0][1:-1]]) + return rel_path + + +class StreamingTasks(DictRecursive): + """ + Main entry of streaming tasks + """ + def __init__(self): + super().__init__() + self.envs = EnvsConfig() + self.streaming_lines = list([EntryConfig()]) + + +class EntryBase: + """ + The basic config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + self.proc_units = proc_units + self.envs = envs + + def execute_pipeline(self): + """ + execute the data processing pipeline + """ + for proc_unit in self.proc_units: + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(proc_unit) + + +class MPEntryBase(EntryBase): + """ + The multi-process config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._enable_mp = True + self._num_worker = 8 + + @abstractmethod + def _sample_list(self): + """ + Return the list of samples to be processed + """ + + def _execute_proc_unit(self, sample: str, proc_unit: ProcessUnit, shared_vars: Dict): + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(sample, proc_unit, shared_vars) + + def _merged_cross_processing(self, ipc_vars): + """ + Merge all shared list information cross all processors + 
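+        Provided as a no-op hook here; subclasses may override it to combine the
+        per-worker `ipc_vars` entries after all worker processes have joined.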
""" + + def _merged_within_processing(self, shared_vars, ipc_vars): + """ + Merge all information within a processor + """ + + def _mp_execute_pipeline(self, samples, ipc_vars: List, worker_offset=0, worker_id=0): + del worker_offset + logging.info(f'worker {worker_id} begin...') + shared_vars = dict() + for s_idx, sample in enumerate(samples): + for proc_unit in self.proc_units: + if s_idx % proc_unit.stride != 0: + continue + self._execute_proc_unit(sample, proc_unit, shared_vars) + self._merged_within_processing(shared_vars, ipc_vars) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def execute_pipeline(self): + logging.info(f'- Start to execute pipeline {self.__class__.__name__}') + samples = self._sample_list() + ipc_vars = mp.Manager().list() + if self._enable_mp: + __class__.multiple_processor(self._mp_execute_pipeline, samples, workers=8, \ + args=(ipc_vars, )) + else: + self._mp_execute_pipeline(samples, ipc_vars) + self._merged_cross_processing(list(ipc_vars)) + logging.info(f'- Finished to execute pipeline {self.__class__.__name__}') \ No newline at end of file diff --git a/prepare_data/structured3d/utils/label_mapping.txt b/prepare_data/structured3d/utils/label_mapping.txt new file mode 100644 index 0000000..593508b --- /dev/null +++ b/prepare_data/structured3d/utils/label_mapping.txt @@ -0,0 +1,40 @@ +1 wall +2 floor +3 cabinet +4 bed +5 chair +6 sofa +7 table +8 door +9 window +10 bookshelf +11 picture +12 counter +13 blinds +14 desk +15 shelves +16 curtain +17 dresser +18 pillow +19 mirror +20 floor mat +21 clothes +22 ceiling +23 books +24 refrigerator +25 television +26 paper +27 towel +28 shower curtain +29 box +30 whiteboard +31 person +32 nightstand +33 toilet +34 sink +35 lamp +36 bathtub +37 bag +38 otherstructure +39 otherfurniture +40 otherprop diff --git a/prepare_data/structured3d/utils/nyu_40.py b/prepare_data/structured3d/utils/nyu_40.py new file mode 100644 index 0000000..48410ba --- /dev/null +++ b/prepare_data/structured3d/utils/nyu_40.py @@ -0,0 +1,94 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+""" +import os +from typing import Dict + + +class NYU40: + """ + NYU40 label definition and color scheme + """ + LABEL_DICT: Dict[str, int] = dict() + INDEX_DICT: Dict[int, str] = dict() + + @staticmethod + def load_dict(i2l: bool): + """ + Load global label dictionary + """ + if not __class__.LABEL_DICT: + label_path = os.path.join(os.path.dirname(os.path.abspath(\ + __file__)), 'label_mapping.txt') + with open(label_path, encoding='utf-8') as l_fp: + for line in l_fp.readlines(): + items = line.rstrip('\n').split('\t') + __class__.LABEL_DICT[items[-1]] = int(items[0]) + __class__.INDEX_DICT[int(items[0])] = items[-1] + return __class__.INDEX_DICT if i2l else __class__.LABEL_DICT + + @staticmethod + def label_to_index(label: str): + """ + Mapping index to label + """ + return __class__.load_dict(False)[label] + + @staticmethod + def index_to_label(index: int): + """ + Mapping index to label + """ + return __class__.load_dict(True)[index] + + @staticmethod + def color_scheme(): + """ + Get the color coding scheme + Source from: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util.py + Copyright: ScanNet + """ + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # other furn + (100, 85, 144) + ] \ No newline at end of file diff --git a/prepare_data/structured3d/utils/s3dutilize.py b/prepare_data/structured3d/utils/s3dutilize.py new file mode 100644 index 0000000..169c870 --- /dev/null +++ b/prepare_data/structured3d/utils/s3dutilize.py @@ -0,0 +1,118 @@ +import numpy as np + +class Annotations: + """ + Annotation information + """ + def __init__(self) -> None: + self.gt_num = 0 + self.name = list() + self.location = list() + self.dimensions = list() + self.gt_boxes_upright_depth = list() + self.unaligned_location = list() + self.unaligned_dimensions = list() + self.unaligned_gt_boxes_upright_depth = list() + self.index = list() + self.classes = list() + self.axis_align_matrix = list() + + def dump(self): + """ + Dump information into dict + """ + anno_dict = dict() + anno_dict['gt_num'] = int(self.gt_num) + anno_dict['name'] = np.asarray(self.name) + anno_dict['location'] = np.asarray(self.location, dtype=np.float64) + anno_dict['dimensions'] = np.asarray(self.dimensions, dtype=np.float64) + anno_dict['gt_boxes_upright_depth'] = np.asarray(self.gt_boxes_upright_depth, \ + dtype=np.float64) + anno_dict['unaligned_location'] = np.asarray(self.unaligned_location, \ + dtype=np.float64) + anno_dict['unaligned_dimensions'] = np.asarray(self.unaligned_dimensions, \ + dtype=np.float64) + anno_dict['unaligned_gt_boxes_upright_depth'] = np.asarray( + self.unaligned_gt_boxes_upright_depth, dtype=np.float64) + anno_dict['index'] = 
np.asarray(self.index, dtype=np.int32) + anno_dict['class'] = np.asarray(self.classes, dtype=np.int64) + anno_dict['axis_align_matrix'] = np.asarray(self.axis_align_matrix, dtype=np.float64) + return anno_dict + + +class S3DUtilize(object): + """ + Structured3D utilize functions + """ + @staticmethod + def get_fov_normal(image_size, cam_focal, norm=True): + """ + Get the normal FoV directions + """ + u2x, v2y = [(np.arange(1, image_size[a_i] + 1) - image_size[a_i] / 2) / cam_focal[a_i]\ + for a_i in [0, 1]] + cam_m_u2x = np.tile([u2x], (image_size[1], 1)) + cam_m_v2y = np.tile(v2y[:, np.newaxis], (1, image_size[0])) + cam_m_depth = np.ones(image_size).T + fov_normal = np.stack((cam_m_depth, -1 * cam_m_v2y, cam_m_u2x), axis=-1) + if norm: + fov_normal = fov_normal / np.sqrt(np.sum(np.square(fov_normal), axis=-1, keepdims=True)) + return fov_normal + + @staticmethod + def cast_perspective_to_local_coord(depth_img: np.ndarray, fov_normal): + """ + Cast the perspective image into 3D coordinate system + """ + return depth_img * fov_normal + + @staticmethod + def cast_points_to_voxel(points, labels, room_size=(6.4, 3.2, 6.4), room_stride=0.2): + """ + Voxelize the points + """ + vol_resolution = (np.asarray(room_size) / room_stride).astype(np.int32) + vol_index = np.floor(points / room_stride).astype(np.int32) + in_vol = np.logical_and(np.all(vol_index < vol_resolution, axis=1), \ + np.all(vol_index >= 0, axis=1)) + v_x, v_y, v_z = [d_[..., 0] for d_ in np.split(vol_index[in_vol], 3, axis=-1)] + vol_label = labels[in_vol] + vol_data = np.zeros(vol_resolution, dtype=np.uint8) + vol_data[v_x, v_y, v_z] = vol_label + return vol_data + + @staticmethod + def get_rotation_matrix_from_tu(cam_front, cam_up): + """ + Get the rotation matrix from TU-coords + """ + cam_n = np.cross(cam_front, cam_up) + cam_m = np.stack((cam_front, cam_up, cam_n), axis=1).astype(np.float32) + return cam_m + + @staticmethod + def get_8points_bounding_box(basis, coeffs, centroid): + """ + Get the 8 corners from the bounding box parameters + """ + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/calculate_color_stats.py b/preprocess/calculate_color_stats.py new file mode 100644 index 0000000..ae1d2bb --- 
/dev/null
+++ b/preprocess/calculate_color_stats.py
@@ -0,0 +1,222 @@
+import numpy as np
+import os
+import os.path as osp
+import yaml
+from tqdm import tqdm
+import argparse
+from omegaconf import DictConfig, OmegaConf
+from pathlib import Path
+
+def load_config(config_path):
+    """Load and resolve the YAML config file."""
+    config = OmegaConf.load(config_path)
+    # Resolve variable substitutions
+    config = OmegaConf.to_container(config, resolve=True)
+    return config
+
+def get_train_scan_ids(dataset_name, base_dir):
+    """Get train scan IDs for each dataset based on their file structure."""
+    split_files = {
+        'scannet': 'scannetv2_train.txt',
+        'scan3r': 'train_scans.txt',
+        'multiscan': 'train_scans.txt',
+        'arkitscenes': 'train_scans.txt',
+        'structured3d': 'train_scans.txt',
+    }
+    train_scan_ids = []
+
+    split_filename = split_files.get(dataset_name.lower())
+    if split_filename is None:
+        return train_scan_ids
+
+    train_file = osp.join(base_dir, 'files', split_filename)
+    if osp.exists(train_file):
+        with open(train_file, 'r') as f:
+            train_scan_ids = [line.strip() for line in f.readlines()]
+    else:
+        print(f"Warning: Train split file not found for {dataset_name}: {train_file}")
+
+    return train_scan_ids
+
+def compute_color_stats_for_dataset(dataset_name, dataset_config, train_scan_ids=None):
+    """
+    Compute color statistics for a specific dataset. 
+ + Args: + dataset_name: Name of the dataset (e.g., 'Scannet', 'Scan3R') + dataset_config: Dataset configuration from YAML + train_scan_ids: List of train scan IDs (optional) + """ + + process_dir = dataset_config['process_dir'] + base_dir = dataset_config['base_dir'] + + print(f"\n{'='*60}") + print(f"PROCESSING DATASET: {dataset_name}") + print(f"{'='*60}") + print(f"Base directory: {base_dir}") + print(f"Process directory: {process_dir}") + + # Get train scan IDs if not provided + if train_scan_ids is None: + train_scan_ids = get_train_scan_ids(dataset_name, base_dir) + + print(f"Total train scans to process: {len(train_scan_ids)}") + + # Filter to only existing processed scans + valid_train_scans = [] + for scan_id in train_scan_ids: + data_path = osp.join(process_dir, 'scans', scan_id, 'data3D.npz') + if osp.exists(data_path): + valid_train_scans.append(scan_id) + + if len(valid_train_scans) == 0: + print(f"No valid processed train scans found for {dataset_name}") + return None + + # Collect color statistics + all_color_means = [] + all_color_second_moments = [] + + + for scan_id in tqdm(valid_train_scans, desc=f"Processing {dataset_name}"): + try: + data_path = osp.join(process_dir, 'scans', scan_id, 'data3D.npz') + data = np.load(data_path, allow_pickle=True) + + # Handle different data structures + scene_data = data['scene'].item() + mesh_colors = scene_data['pcl_feats'] + + # Normalize colors to [0, 1] range + colors_normalized = mesh_colors / 255.0 + + # Compute per-scan statistics + color_mean = colors_normalized.mean(axis=0) # E[X] + color_second_moment = (colors_normalized ** 2).mean(axis=0) # E[X²] + + all_color_means.append(color_mean) + all_color_second_moments.append(color_second_moment) + + except Exception as e: + print(f"Error processing {scan_id}: {e}") + continue + + if len(all_color_means) == 0: + print(f"No valid color data found for {dataset_name}") + return None + + # Compute global statistics + global_color_mean = np.array(all_color_means).mean(axis=0) + global_color_second_moment = np.array(all_color_second_moments).mean(axis=0) + global_color_std = np.sqrt(global_color_second_moment - global_color_mean**2) + + # Prepare output + color_mean_std = { + 'mean': [float(val) for val in global_color_mean], + 'std': [float(val) for val in global_color_std] + } + + # Save to dataset's process directory + output_path = osp.join(process_dir, 'color_mean_std.yaml') + os.makedirs(process_dir, exist_ok=True) + + with open(output_path, 'w') as f: + yaml.dump(color_mean_std, f, default_flow_style=False, indent=2) + + print(f"\nStatistics computed from {len(all_color_means)} train scans:") + print(f"Mean (RGB): [{global_color_mean[0]:.6f}, {global_color_mean[1]:.6f}, {global_color_mean[2]:.6f}]") + print(f"Std (RGB): [{global_color_std[0]:.6f}, {global_color_std[1]:.6f}, {global_color_std[2]:.6f}]") + print(f"Saved to: {output_path}") + + return color_mean_std + +def main(): + parser = argparse.ArgumentParser(description="Compute color statistics for all datasets from process_3d.yaml config") + parser.add_argument("--config-path", type=str, + default="/Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess", + help="Path to config directory") + parser.add_argument("--config-name", type=str, + default="process_3d.yaml", + help="Config file name") + parser.add_argument("--datasets", type=str, nargs="*", + help="Specific datasets to process (e.g., Scannet Scan3R). 
If not specified, processes all.") + + args = parser.parse_args() + + # Load configuration + config_file = osp.join(args.config_path, args.config_name) + if not osp.exists(config_file): + print(f"Config file not found: {config_file}") + return + + config = load_config(config_file) + + data_config = config['data'] + available_datasets = [] + + for key, value in data_config.items(): + if key.lower() == 'front3d': + continue + if isinstance(value, dict) and 'process_dir' in value and 'base_dir' in value: + available_datasets.append(key) + + if args.datasets: + datasets_to_process = [d for d in args.datasets if d in available_datasets] + if not datasets_to_process: + print(f"None of the specified datasets found in config: {args.datasets}") + return + else: + datasets_to_process = available_datasets + + print(f"Processing datasets: {datasets_to_process}") + + # Process each dataset + results = {} + for dataset_name in datasets_to_process: + dataset_config = data_config[dataset_name] + + try: + result = compute_color_stats_for_dataset(dataset_name, dataset_config) + results[dataset_name] = result + except Exception as e: + print(f"Failed to process {dataset_name}: {e}") + results[dataset_name] = None + + # Summary + print(f"\n{'='*80}") + print(f"PROCESSING SUMMARY") + print(f"{'='*80}") + + for dataset_name, result in results.items(): + if result: + print(f"✓ {dataset_name}: Successfully computed color statistics") + else: + print(f"✗ {dataset_name}: Failed to compute color statistics") + + print(f"{'='*80}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..0e3df30 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..d02c23b --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,102 @@ +import os.path as osp +import numpy as np +from common import load_utils +from util import arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, 
label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] + + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..eb64243 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,98 @@ +import os.path as osp +import numpy as np +from common import load_utils +from util import multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = 
load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + + def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] + + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/scan3r.py b/preprocess/feat1D/scan3r.py index 65fb6e9..0c84043 100644 --- a/preprocess/feat1D/scan3r.py +++ b/preprocess/feat1D/scan3r.py @@ -1,5 +1,4 @@ import os.path as osp -import torch import numpy as np from common import load_utils from util import scan3r @@ -32,10 +31,12 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - 
objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] object_referral_embeddings, scene_referral_embeddings = {}, None @@ -53,11 +54,12 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None: scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) assert scene_referral_embeddings is not None - data1D = {} data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, scan_objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat1D/scannet.py b/preprocess/feat1D/scannet.py index e49b8e0..11a94cb 100644 --- a/preprocess/feat1D/scannet.py +++ b/preprocess/feat1D/scannet.py @@ -1,7 +1,5 @@ import os.path as osp -import torch import numpy as np - from common import load_utils from util import scannet from typing import Dict, List, Union @@ -34,10 +32,12 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] object_referral_embeddings, scene_referral_embeddings = {}, None @@ -55,11 +55,10 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None: scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) assert scene_referral_embeddings is not None - data1D = {} data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} data1D['scene'] = {'referral_embedding': scene_referral_embeddings} - - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat1D/structured3d.py b/preprocess/feat1D/structured3d.py new file mode 100644 index 0000000..0a1084a --- /dev/null +++ b/preprocess/feat1D/structured3d.py @@ -0,0 +1,135 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import structured3d +from util.structured3d import S3D_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(Structured3D_1DProcessor, 
self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + + def compute1DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + obj2tgtid_map = load_utils.load_json(osp.join(self.data_dir,'scans',scan_id,'2D_rendering',room_id,'obj2tgid.json')) + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + objectID_to_labelID_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(objectID_to_labelID_map.keys()) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(full_scan_id, objectID_to_labelID_map, obj2tgtid_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == full_scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, objectID_to_labelID_map, obj2tgtid): + object_referral_embeddings = {} + matched_objids=[] + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for instance_id in objectID_to_labelID_map.keys(): + if str(instance_id) not in obj2tgtid.keys(): + # print(f"Instance ID {instance_id} not found in obj2tgtid mapping for scan {scan_id}. 
Skipping...")
+                continue
+            mapped_obj_id = obj2tgtid[str(instance_id)]
+            nyu40id = objectID_to_labelID_map[instance_id]
+            if nyu40id == 0:
+                continue
+            label = S3D_SCANNET[nyu40id]
+            object_referral = []
+            for referral in scan_referrals:
+                if int(referral['target_id']) == int(mapped_obj_id):
+                    if referral['instance_type'] == label:
+                        # print(referral['utterance'])
+                        matched_objids.append(instance_id)
+                        # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id)
+                        object_referral.append(referral['utterance'])
+                    # else:
+                        # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id)
+
+            if len(object_referral) != 0:
+                # print(scan_id,instance_id,len(object_referral))
+                object_referral_feats = self.extractTextFeats(object_referral)
+                if object_referral_feats is not None:
+                    object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1)
+                    assert object_referral_feats.shape == (1, self.embed_dim)
+
+                    object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats}
+
+        # finding unmatched referrals
+        unmatched_referrals = []
+        for referral in scan_referrals:
+            mapped_obj_id = referral['target_id']
+            if int(mapped_obj_id) not in [int(obj2tgtid[str(instance_id)]) for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid]:
+                unmatched_referrals.append(referral)
+            elif any(int(mapped_obj_id) == int(obj2tgtid[str(instance_id)]) and S3D_SCANNET[objectID_to_labelID_map[instance_id]] != referral['instance_type']
+                     for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid and objectID_to_labelID_map[instance_id] != 0):
+                unmatched_referrals.append(referral)
+
+        label_to_instances = {}
+        for instance_id, nyu40id in objectID_to_labelID_map.items():
+            if nyu40id == 0:
+                continue
+            label = S3D_SCANNET[nyu40id]
+            if label not in label_to_instances:
+                label_to_instances[label] = []
+            label_to_instances[label].append(instance_id)
+
+        for referral in unmatched_referrals:
+            instance_type = referral['instance_type']
+            if instance_type in label_to_instances and len(label_to_instances[instance_type]) == 1:
+                instance_id = label_to_instances[instance_type][0]
+                if instance_id not in matched_objids:
+                    # print(f"Matching unmatched referral to unique instance: {scan_id},{instance_id}, {instance_type}, {referral['target_id']}")
+                    if instance_id not in object_referral_embeddings:
+                        object_referral = [referral['utterance']]
+                    else:
+                        # Append to the existing referral list and embed the complete set for this instance.
+                        object_referral_embeddings[instance_id]['referral'].append(referral['utterance'])
+                        object_referral = object_referral_embeddings[instance_id]['referral']
+
+                    object_referral_feats = self.extractTextFeats(object_referral)
+                    if object_referral_feats is not None:
+                        object_referral_feats = np.mean(object_referral_feats, axis=0).reshape(1, -1)
+                        object_referral_embeddings[instance_id] = {'referral': object_referral, 'feats': object_referral_feats}
+
+        # print(object_referral_embeddings.keys())
+        return object_referral_embeddings
diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py
index 9a1b744..0e3df30 100644
--- a/preprocess/feat2D/__init__.py
+++ b/preprocess/feat2D/__init__.py
@@ -1,2 +1,5 @@
 from .scannet import *
-from .scan3r import *
\ No newline at end of file
+from .scan3r import *
+from .arkit import *
+from .multiscan import *
+from .structured3d import *
diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py
new file mode 100644
index 0000000..a2d02a8
--- /dev/null
+++ b/preprocess/feat2D/arkit.py
@@ -0,0 +1,233 @@
+import os.path as osp
+import open3d as o3d
+import 
numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util +import os + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + obj_id_imgs = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} + + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path 
= osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'),allow_pickle=True) 
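+        # Build a per-object vote table from the projected 2D segmentation: for every frame,
+        # each visible object id is credited with the number of pixels it covers, and only the
+        # self.top_k frames with the highest counts are kept below as that object's views.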
+ object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..b325a31 --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,231 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R +import os +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, 
config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + obj_id_imgs = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id): + data2D = {} + + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # 
Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + # object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'),allow_pickle=True) + + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + 
object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/scan3r.py b/preprocess/feat2D/scan3r.py index 4927c97..5b1d307 100644 --- a/preprocess/feat2D/scan3r.py +++ b/preprocess/feat2D/scan3r.py @@ -7,7 +7,7 @@ from scipy.spatial.transform import Rotation as R from omegaconf import DictConfig from typing import List, Dict, Tuple - +import os from common import load_utils from util import render, scan3r, visualisation from util import image as image_util @@ -58,9 +58,13 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) - ply_data = scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_imgs = {} + ply_data = scan3r.load_ply_data(self.data_dir, scan_id, self.label_filename) instance_ids = ply_data['objectId'] - + camera_info = scan3r.load_intrinsics(scene_folder) intrinsics = camera_info['intrinsic_mat'] img_width = int(camera_info['width']) @@ -74,9 +78,8 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - + # project 3D model - obj_id_imgs = {} for frame_idx in self.frame_pose_data[scan_id]: img_pose = self.frame_pose_data[scan_id][frame_idx] img_pose_inv = np.linalg.inv(img_pose) @@ -87,21 +90,17 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: ) obj_id_imgs[frame_idx] = obj_id_map - - # save scene-level file for efficient loading - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> 
None: + data2D = {} scene_folder = osp.join(self.data_dir, 'scans', scan_id) color_path = osp.join(scene_folder, 'sequence') scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] # Multi-view Image -- Object (Embeddings) object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) @@ -122,7 +121,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} @@ -131,7 +130,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: floorplan_dict = {'img' : None, 'embedding' : None} data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -162,7 +161,7 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: - object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'), allow_pickle=True) object_image_votes = {} # iterate over all frames diff --git a/preprocess/feat2D/scannet.py b/preprocess/feat2D/scannet.py index 8c59354..ec3d29d 100644 --- a/preprocess/feat2D/scannet.py +++ b/preprocess/feat2D/scannet.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os import imageio import skimage.transform as sktf from PIL import Image @@ -81,12 +81,14 @@ def renderShapeAndFloorplan(self, scene_folder: str, scene_out_folder: str, scan return render_img def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} frame_idxs = list(self.frame_pose_data[scan_id].keys()) scene_folder = osp.join(self.data_dir, 'scans', scan_id) scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) + # Floor-plan rendering render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) floorplan_embeddings = None @@ -95,7 +97,6 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) render_img_pt = self.model.base_tf(render_img) floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) - floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} 
# Multi-view Image -- Object (Embeddings) @@ -117,13 +118,12 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - data2D = {} data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid diff --git a/preprocess/feat2D/structured3d.py b/preprocess/feat2D/structured3d.py new file mode 100644 index 0000000..46392c9 --- /dev/null +++ b/preprocess/feat2D/structured3d.py @@ -0,0 +1,247 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +import cv2 +from common import load_utils +from util import render, structured3d, visualisation +from util import image as image_util +import os +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(Structured3D_2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id, '2D_rendering', room_id, 'perspective', 'full') + frame_idxs = [f for f in os.listdir(scene_folder) if f[0] != '.' 
and f[0] != 'g'] + pose_data = structured3d.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[full_scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[full_scan_id]: + image_path=osp.join(scene_folder, frame_idx, 'instance.png') + obj_id_map = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) + obj_id_imgs[frame_idx] = obj_id_map + + if osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + floorplan_img_path = osp.join(self.data_dir,'scans', scan_id, 'floorplans', f'{room_id}.png') + floorplan_img = cv2.imread(floorplan_img_path) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_BGR2RGB) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_RGB2GRAY) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_GRAY2RGB) + floorplan_img = image_util.crop_image(floorplan_img, floorplan_img_path.replace('.png', '_cropped.png')) + floorplan_embeddings = None + + if floorplan_img is not None: + floorplan_img = floorplan_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + floorplan_img_pt = self.model.base_tf(floorplan_img) + floorplan_embeddings = self.extractFeatures([floorplan_img_pt], return_only_cls_mean = False) + floorplan_dict = {'img' : floorplan_img, 'embedding' : floorplan_embeddings} + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[full_scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(full_scan_id, scene_folder, frame_idxs) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + data2D['scene']['floorplan'] = floorplan_dict + + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + # def computeAllImageFeaturesEachScan(self, scan_id): + # scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + # color_path = osp.join(scene_folder, 'sequence') + # scene_out_dir = osp.join(self.out_dir, scan_id) + # 
load_utils.ensure_dir(scene_out_dir) + + # frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # # Extract Scene Image Features + # scene_images_pt = [] + # scene_image_embeddings = [] + # for frame_index in frame_idxs: + # image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + # image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + # image_pt = self.model.base_tf(image) + # # image_pt = torch.zeros(1, 1536) + + # scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + # scene_images_pt.append(image_pt) + # scene_image_embeddings = np.concatenate(scene_image_embeddings) + # data2D = {} + # data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + # 'frame_idxs' : frame_idxs} + # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + # np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # print(sampled_frame_idxs) + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, frame_index, f'rgb_rawlight.png')) + image = image.convert('RGB') + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, obj_id_to_label_id_map): + # object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert 
len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, frame_idx, 'rgb_rawlight.png') + # print(image_path) + color_img = Image.open(image_path) + # print(color_img.mode) + color_img = color_img.convert('RGB') + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # print(np.array(image).shape) + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + # print(np.array(cropped_img).shape) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats + diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..0e3df30 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..74f66c2 --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,93 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import os +from common import load_utils +from util import arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = 
label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..e2c047e --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,89 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import os +from common import load_utils +from util import multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def 
load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) diff --git a/preprocess/feat3D/scan3r.py b/preprocess/feat3D/scan3r.py index 7b949ca..b015609 100644 --- a/preprocess/feat3D/scan3r.py +++ b/preprocess/feat3D/scan3r.py @@ -1,13 +1,12 @@ +import os import os.path as osp import open3d as o3d import numpy as np import torch -from tqdm import tqdm from omegaconf import DictConfig -from typing import Any, Dict from common import load_utils -from util import point_cloud, scan3r +from util import scan3r from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat3D.base import Base3DProcessor @@ -43,12 +42,17 @@ def __init__(self, config_data: DictConfig, config_3D: DictConfig, split: str) - self.feature_extractor = self.loadFeatureExtractor(config_3D, "3D") def compute3DFeaturesEachScan(self, scan_id: str) -> None: + """ + Computes 3D features for a single scan. 
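+        Vertex positions and colours are read directly from the labelled PLY,
+        and the per-scan outputs are saved as data3D.npz and object_id_to_label_id_map.npz.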
+ """ ply_data = scan3r.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, self.label_filename) mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) - mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, self.label_filename)) - mesh_colors = np.asarray(mesh.vertex_colors)*255.0 - mesh_colors = mesh_colors.round() + # mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, self.label_filename)) + # mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + # mesh_colors = mesh_colors.round() + mesh_colors = np.stack([ply_data['red'], ply_data['green'], ply_data['blue']]).transpose((1, 0)) + scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] @@ -79,5 +83,5 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/scannet.py b/preprocess/feat3D/scannet.py index e530195..e4380b9 100644 --- a/preprocess/feat3D/scannet.py +++ b/preprocess/feat3D/scannet.py @@ -1,5 +1,5 @@ import os.path as osp - +import os import numpy as np import torch from omegaconf import DictConfig @@ -64,7 +64,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: mesh_points = mesh_vertices[:, 0:3] mesh_colors = mesh_vertices[:, 3:] - text_file = mesh_file.replace('_vh_clean_2.labels.ply' , '.txt') + text_file = mesh_file.replace('_vh_clean_2.ply' , '.txt') with open(text_file, 'r') as file: for line in file: if line.startswith('sceneType'): @@ -79,10 +79,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: if len(shape_annot) > 0: shape_annot = shape_annot[0] shape_annot_to_instance_map = scannet.get_cad_model_to_instance_mapping(instance_bboxes, shape_annot, meta_file, self.shape_dir) - - render_out_dir = osp.join(scene_out_dir, 'render') - load_utils.ensure_dir(render_out_dir) - + for instance_id in unique_instance_ids: if instance_id == self.undefined: continue @@ -98,11 +95,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: shape_annot_instance = shape_annot_to_instance_map[instance_id] object_cad_pcl = shape_annot_instance['points'] object_cad_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_cad_pcl) - - obj_verts, obj_faces, transform_shape = shape_annot_instance['verts'], shape_annot_instance['faces'], shape_annot_instance['transform_shape'] - # load_utils.ensure_dir(osp.join(render_out_dir, f'{instance_id}')) - # render.render_multiview_images(obj_verts, obj_faces, transform_shape, osp.join(render_out_dir, f'{instance_id}')) - + data3D = {} data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} data3D['scene'] = {'pcl_coords': mesh_points[instance_ids != self.undefined], 'pcl_feats': mesh_colors[instance_ids != self.undefined], 'scene_label' : scene_label} @@ -112,7 +105,5 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) assert 
len(list(object_id_to_label_id.keys())) >= len(list(object_cad_embeddings.keys())), 'CAD does not match for {}'.format(scan_id) - - - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/structured3d.py b/preprocess/feat3D/structured3d.py new file mode 100644 index 0000000..2be5fe5 --- /dev/null +++ b/preprocess/feat3D/structured3d.py @@ -0,0 +1,83 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import json +from common import load_utils +from util import structured3d +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class Structured3D_3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(Structured3D_3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + def compute3DFeaturesEachScan(self, scan_id): + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + ply_data = structured3d.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, room_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh_colors = np.stack([ply_data['red'], ply_data['green'], ply_data['blue']]).transpose((1, 0)) + + object_ids = ply_data['objectId'] + unique_objects = np.unique(object_ids) + semantic_ids = ply_data['nyu40id'] + + scene_label = None + with open(osp.join(self.data_dir, 'scans', scan_id, 'annotation_3d.json')) as file: + annotations = json.load(file) + + for annos in annotations['semantics']: + if annos['ID'] == int(room_id): + scene_label = annos['type'].strip() + break + + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + + for idx, instance_id in enumerate(unique_objects): + object_pcl=mesh_points[np.where(ply_data['objectId'] == instance_id)] + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + + all_point_indices = np.where(object_ids == instance_id)[0] + nyu40ids_for_object = semantic_ids[all_point_indices] + unique_ids, counts = np.unique(nyu40ids_for_object, return_counts=True) + nyu40id = unique_ids[np.argmax(counts)] + object_id_to_label_id[instance_id] = nyu40id + + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + else: + print("Object {} has less than {} points".format(instance_id, self.config_3D.min_points_per_object)) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points, 'pcl_feats': mesh_colors, 'scene_label' : scene_label} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= 
len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id+'_'+room_id) + load_utils.ensure_dir(scene_out_dir) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) + diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..e3ab69e 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -2,13 +2,12 @@ import numpy as np from functools import reduce from operator import add -import torch from tqdm import tqdm from omegaconf import DictConfig import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit, multiscan, structured3d from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -33,6 +32,12 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'Structured3D': + self.scan_ids = structured3d.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError @@ -71,7 +76,8 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} @@ -139,17 +145,16 @@ def prepareObjectWiseDataEachScan(self, 'object_ids' : object_ids, 'topK_images_votes' : data2D['objects']['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data1D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data1D.npz')) + data2D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data2D.npz')) + data3D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data3D.npz')) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - self.logger.info('Scene 
Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file diff --git a/scripts/evaluation/eval_object_retrieval.sh b/scripts/evaluation/eval_object_retrieval.sh index 23f84f1..b6c37ea 100644 --- a/scripts/evaluation/eval_object_retrieval.sh +++ b/scripts/evaluation/eval_object_retrieval.sh @@ -3,17 +3,17 @@ export PYTHONWARNINGS="ignore" # Change val according to the dataset you want to evaluate on # Instance Baseline -python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ ---config-name eval_instance.yaml \ -task.InferenceObjectRetrieval.val=['Scannet'] \ -task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_baseline_scannet+scan3r.pth \ -model.name=ObjectLevelEncoder \ -hydra.run.dir=. hydra.output_subdir=null +# python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ +# --config-name eval_instance.yaml \ +# task.InferenceObjectRetrieval.val=['Scannet'] \ +# task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_baseline_scannet+scan3r.pth \ +# model.name=ObjectLevelEncoder \ +# hydra.run.dir=. hydra.output_subdir=null # Instance CrossOver python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ --config-name eval_instance.yaml \ -task.InferenceObjectRetrieval.val=['Scan3R'] \ -task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth \ +task.InferenceObjectRetrieval.val=['ARKitScenes'] \ +task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/new_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth \ model.name=SceneLevelEncoder \ hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/evaluation/eval_scene_retrieval.sh b/scripts/evaluation/eval_scene_retrieval.sh index 1752e84..1a1f397 100644 --- a/scripts/evaluation/eval_scene_retrieval.sh +++ b/scripts/evaluation/eval_scene_retrieval.sh @@ -2,6 +2,6 @@ export PYTHONWARNINGS="ignore" # Scene Retrieval Inference python run_evaluation.py --config-path "$(pwd)/configs/evaluation" --config-name eval_scene.yaml \ -task.InferenceSceneRetrieval.val=['Scan3R'] \ -task.InferenceSceneRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth \ +task.InferenceSceneRetrieval.val=['ARKitScenes'] \ +task.InferenceSceneRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/UnifiedTrain_Scannet+Scan3R+MultiScan+ARKitScenes/2025-07-03-07:39:02.553100/ckpt/best.pth \ hydra.run.dir=. 
hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/calculate_color_stats.sh b/scripts/preprocess/calculate_color_stats.sh new file mode 100644 index 0000000..238088c --- /dev/null +++ b/scripts/preprocess/calculate_color_stats.sh @@ -0,0 +1 @@ +python3 preprocess/calculate_color_stats.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml --datasets Structured3D \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..466f751 --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..a13a93c --- /dev/null +++ b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /"$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh index 6d8a981..0adfbae 100644 --- a/scripts/preprocess/process_scan3r.sh +++ b/scripts/preprocess/process_scan3r.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. 
hydra.output_subdir=null - # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..ec86441 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +# python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +# python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_structured3d.sh b/scripts/preprocess/process_structured3d.sh new file mode 100644 index 0000000..08c0605 --- /dev/null +++ b/scripts/preprocess/process_structured3d.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_3d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_1d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_2d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null + +# # Multi-modal dumping +python preprocessor.py --config-path /Users/gauravpradeep/CrossOver_ScaleUp/configs/preprocess --config-name process_multimodal.yaml data.sources=['Structured3D'] hydra.run.dir=. 
hydra.output_subdir=null
diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py
index 9a1b744..d7126ea 100644
--- a/single_inference/datasets/__init__.py
+++ b/single_inference/datasets/__init__.py
@@ -1,2 +1,4 @@
 from .scannet import *
-from .scan3r import *
\ No newline at end of file
+from .scan3r import *
+from .arkit import *
+from .multiscan import *
diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py
new file mode 100644
index 0000000..6434bde
--- /dev/null
+++ b/single_inference/datasets/arkit.py
@@ -0,0 +1,126 @@
+import os.path as osp
+import numpy as np
+from torch.utils.data import Dataset
+import MinkowskiEngine as ME
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+from torchvision import transforms as tvf
+import torch
+import open3d as o3d
+import pandas as pd
+from common import load_utils
+from util import arkit
+from util import image as image_util
+
+class ARKitScenesInferDataset(Dataset):
+    def __init__(self, data_dir,voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None:
+        self.voxel_size = voxel_size
+        self.frame_skip = frame_skip
+        self.image_size = image_size
+
+        self.scans_dir = osp.join(data_dir, 'scans')
+        self.files_dir = osp.join(data_dir, 'files')
+        self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json'))
+
+        self.scan_ids = []
+        for split in ['train', 'val']:
+            filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split))
+            self.scan_ids.extend(np.genfromtxt(filepath, dtype = str))
+
+        self.base_tf = tvf.Compose([
+            tvf.ToTensor(),
+            tvf.Normalize(mean=[0.485, 0.456, 0.406],
+                          std=[0.229, 0.224, 0.225])
+        ])
+        self.metadata = pd.read_csv(osp.join(self.files_dir,'metadata.csv'))
+
+
+    def extract_images(self, scan_id, color_path):
+        pose_data = arkit.load_poses(self.scans_dir, scan_id, skip=self.frame_skip)
+        frame_idxs = list(pose_data.keys())
+
+        pose_data_arr = []
+        for frame_idx in frame_idxs:
+            pose = pose_data[frame_idx]
+            rot_quat = R.from_matrix(pose[:3, :3]).as_quat()
+            trans = pose[:3, 3]
+            pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]])
+
+        pose_data_arr = np.array(pose_data_arr)
+        sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr)
+        sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0]
+
+        image_data = None
+        for idx in sampled_frame_idxs:
+            frame_index = frame_idxs[idx]
+            image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png'))
+            if sky_direction=='Left':
+                image = image.transpose(Image.ROTATE_270)
+            elif sky_direction=='Right':
+                image = image.transpose(Image.ROTATE_90)
+            image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC)
+            image_pt = self.base_tf(image).unsqueeze(0)
+            image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0)
+
+        return image_data.unsqueeze(0)
+
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            scan_id = self.scan_ids[index]
+
+        if isinstance(index, str):
+            scan_id = index
+
+        scan_folder = osp.join(self.scans_dir, scan_id)
+        data_dict = {}
+        data_dict['masks'] = {}
+
+        # Point Cloud
+        mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id)))
+        points = np.asarray(mesh.vertices)
+        feats = np.asarray(mesh.vertex_colors)*255.0
+        feats = feats.round()
+
+        feats /= 255.
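+        # shift colours to be zero-centred (roughly [-0.5, 0.5]) before voxelising
+        # the point cloud with MinkowskiEngine below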
+ feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + 
image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,10 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/trainer/unified_trainer.py b/trainer/unified_trainer.py index 9b48516..d43f795 100644 --- a/trainer/unified_trainer.py +++ b/trainer/unified_trainer.py @@ -16,14 +16,23 @@ def __init__(self, cfg: DictConfig) -> None: super().__init__(cfg) self.task_config = rgetattr(cfg.task, cfg.task.name) - object_enc_ckpt = self.task_config.object_enc_ckpt + # ckpt = '/drive/dumps/multimodal-spaces/runs/new_runs/scene_crossover_scannet+scan3r_scratch.pth' + # self.logger.info(f"Loading Initial Weights from {ckpt}") + + # # Load model weights from safetensors files + # ckpt = osp.join(ckpt, 'model.safetensors') 
+ # weights = load_file(ckpt, device = str(self.accelerator.device)) + # self.model.load_state_dict(weights) + # self.logger.info(f"Successfully loaded initial weights from {ckpt}") + + object_enc_ckpt = self.task_config.object_enc_ckpt self.logger.info(f"Loading Object Wise Modality Encoder from {str(object_enc_ckpt)}") # Load model weights from safetensors files object_enc_ckpt = osp.join(object_enc_ckpt, 'model.safetensors') object_enc_ckpt = load_file(object_enc_ckpt, device = str(self.accelerator.device)) self.model.objectwise_modality_encoder.load_state_dict(object_enc_ckpt) - self.logger.info(f"Successfully loaded from {self.task_config.object_enc_ckpt}") + self.logger.info(f"Successfully loaded Object Wise Modality Encoder from {self.task_config.object_enc_ckpt}") def train_step(self, epoch: int) -> None: self.model.train() diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..3eb332a --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,331 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. + The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. 
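+    Returns:
+        matrix {np.ndarray} -- the corresponding 3x3 rotation matrix (from cv2.Rodrigues).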
+ """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, 
principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + vertices_structured['objectId'] = vertex_instance + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), 
np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres diff --git a/util/multiscan.py b/util/multiscan.py new file mode 100644 index 0000000..5ce872e --- /dev/null +++ b/util/multiscan.py @@ -0,0 +1,698 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os +import pandas as pd + +MULTISCAN_SCANNET = { + "wall": "wall", + "door": "door", + "slippers": "shoe", + "mop": "broom", + "rug": "rug", + "floor": "floor", + "basin": "sink", + "basin_stand": "sink", + "bucket": "bucket", + "shower": "shower", + "water_tank": "container", + "beam": "wood beam", + "pillar": "pillar", + "ceiling": "ceiling", + "sink": "sink", + "toilet": "toilet", + "cabinet": "cabinet", + "remove": "object", + "towel": "towel", + "pillow": "pillow", + "sofa": "sofa", + "footstool": "footstool", + "picture": "picture", + "window": "window", + "heater": "heater", + "mirror": "mirror", + "pipe": "pipe", + "scarf": "cloth", + "ceiling_light": "ceiling light", + "chair": "chair", + "table": "table", + "vent": "vent", + "bag": "bag", + "wall_cabinet": "cabinet", + "range": "stove", + "ricemaker": "rice cooker", + "pan": "cooking pan", + "coffee_machine": "coffee maker", + "rice_bag": "bag", + "light": "light", + "trashbin": "trash bin", + "kettle": "kettle", + "refrigerator": "refrigerator", + "microwave": "microwave", + "light_switch": "light switch", + "rice_cooker": "rice cooker", + "box": "box", + "shoe": "shoe", + "range_hood": "range hood", + "wok": "cooking pan", + "router": "object", + "paper_towel": "paper towel roll", + "stock_pot": "pot", + "cutting_board": "cutting board", + "wall_calendar": "calendar", + "baseboard": "object", + "coke_box": "box", + "printer": "printer", + "bowl": "bowl", + "backpack": "backpack", + "baseboard_heater": "heater", + "broom": "broom", + "dust_pan": "dustpan", + "trash_bin": "trash bin", + "rigid_duct": "vent", + "electric_range": "stove", + "spatula": "object", + "faucet": "faucet", + "bottle": "bottle", + "countertop": "counter", + "railing": "railing", 
+ "suitcase": "suitcase", + "trash": "trash can", + "pot": "pot", + "kitchen_tool": "object", + "vegetable": "object", + "board": "board", + "washing_machine": "washing machine", + "jar": "jar", + "object": "object", + "notebook": "book", + "induction_cooker": "stove", + "instant_pot_lid": "cooking pot", + "oven": "oven", + "air_fryer": "object", + "lid": "pot", + "sponge": "sponge", + "blender": "object", + "spoon": "object", + "dishwasher": "dishwasher", + "detergent": "laundry detergent", + "watermelon": "bananas", + "yard_waste_bag": "garbage bag", + "container": "container", + "newspapers": "paper", + "rag": "cloth", + "ladder": "ladder", + "gate": "door", + "napkin_box": "tissue box", + "jacket": "jacket", + "windowsill": "windowsill", + "water_faucet": "faucet", + "steel_ball": "ball", + "rice_maker": "rice cooker", + "watter_bottle": "water bottle", + "plastic_bag": "bag", + "paper_bag": "paper bag", + "cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": "doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": "clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + 
"showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", + "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + "safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + 
"water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + "card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + "knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit 
sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def annotations_to_dataframe_obj(annotations): + objects = annotations['objects'] + df_list = [] + for obj in objects: + object_id = obj['objectId'] + object_label = obj['label'] + df_row = pd.DataFrame( + [[object_id, object_label]], + columns=['objectId', 'objectLabel'] + ) + df_list.append(df_row) + df = pd.concat(df_list) + return df + + +def load_ply_data(data_dir, scan_id): + """ + Load PLY data and propagate object IDs from faces to vertices. + + Args: + data_dir (str): Directory containing the PLY file. + scan_id (str): Identifier for the scan. + + Returns: + np.ndarray: Vertex data with propagated object IDs. + """ + # with open(osp.join(data_dir, scan_id, f'{scan_id}.annotations.json'), "r", encoding='utf-8') as f: + # annotations = json.load(f) + + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) + + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + # Extract vertex properties + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + + # Extract normals if available + if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: + nx = np.array(ply_data['vertex']['nx']) + ny = np.array(ply_data['vertex']['ny']) + nz = np.array(ply_data['vertex']['nz']) + normals = np.stack([nx, ny, nz], axis=-1) + else: + normals = None + + + vertex_object_ids = np.full(len(x), -1, dtype='int32') + + # Extract face data + faces = ply_data['face'].data + face_vertex_indices = [face['vertex_indices'] for face in faces] + face_object_ids = [face['objectId'] for face in faces] + + # Propagate object IDs to vertices + for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), # Colors + ('objectId', 'i4') # Propagated Object ID + ] + + if normals is not None: + vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals + + vertices = np.empty(len(x), dtype=vertex_dtype) + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + 
vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + + if normals is not None: + vertices['nx'] = normals[:, 0].astype('f4') + vertices['ny'] = normals[:, 1].astype('f4') + vertices['nz'] = normals[:, 2].astype('f4') + + return vertices + +def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(meta_intrinsics_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + intrinsic_mat = np.array(stream.get("intrinsics")) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + resolution = stream.get("resolution") + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + return intrinsics + +def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + intrinsics_path = osp.join(scan_dir, 'poses.jsonl') + resoultion_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(resoultion_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + resolution = stream.get("resolution", None) + if resolution: + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + + with jsonlines.open(intrinsics_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + intrinsic_mat = np.asarray(entry.get('intrinsics')) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + break + + return intrinsics + +def load_pose(scan_dir, frame_id): + # Find alignment file + alignment_path = None + for file_name in os.listdir(scan_dir): + if file_name.endswith('.align.json'): + alignment_path = osp.join(scan_dir, file_name) + break + + if alignment_path is None: + raise FileNotFoundError(f"No alignment file found in {scan_dir}") + + with open(alignment_path, "r") as f: + alignment_data = json.load(f) + if 'coordinate_transform' not in alignment_data: + raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'") + coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F') + inv_transform = np.linalg.inv(coordinate_transform) + + pose_path = osp.join(scan_dir, 'poses.jsonl') + with jsonlines.open(pose_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + transform = np.asarray(entry.get('transform')) + transform = np.reshape(transform, (4, 4), order='F') + transform = np.dot(transform, np.diag([1, -1, -1, 1])) + transform = transform / transform[3][3] + aligned_pose = inv_transform @ transform #align camera poses + return aligned_pose + + raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") + + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, int(frame_idx)) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = 
[frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is None: + frame_idxs = frame_idxs + else: + frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]] + return frame_idxs + + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False \ No newline at end of file diff --git a/util/scan3r.py b/util/scan3r.py index 2727d5a..31684aa 100644 --- a/util/scan3r.py +++ b/util/scan3r.py @@ -3,6 +3,8 @@ from plyfile import PlyData from glob import glob import csv +import json +import trimesh def get_scan_ids(dirname: str, split: str) -> np.ndarray: """Retrieve scan IDs for the given directory and split.""" @@ -10,34 +12,54 @@ def get_scan_ids(dirname: str, split: str) -> np.ndarray: scan_ids = np.genfromtxt(filepath, dtype = str) return scan_ids -def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarray: - """Load PLY data from specified directory, scan ID, and label file.""" +def load_ply_data(data_dir, scan_id, label_file_name): filename_in = osp.join(data_dir, scan_id, label_file_name) file = open(filename_in, 'rb') ply_data = PlyData.read(file) file.close() x = ply_data['vertex']['x'] - y = ply_data['vertex']['y'] - z = ply_data['vertex']['z'] - red = ply_data['vertex']['red'] - green = ply_data['vertex']['green'] - blue = ply_data['vertex']['blue'] + # y = ply_data['vertex']['y'] + # z = ply_data['vertex']['z'] + # red = ply_data['vertex']['red'] + # green = ply_data['vertex']['green'] + # blue = ply_data['vertex']['blue'] object_id = ply_data['vertex']['objectId'] global_id = ply_data['vertex']['globalId'] nyu40_id = ply_data['vertex']['NYU40'] eigen13_id = ply_data['vertex']['Eigen13'] rio27_id = ply_data['vertex']['RIO27'] - vertices = np.empty(len(x), dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + obj_mesh = trimesh.load(osp.join(data_dir, scan_id, 'mesh.refined.v2.obj')) + + obj_mesh_points = np.asarray(obj_mesh.vertices) + obj_mesh_colors = obj_mesh.visual.to_color().vertex_colors[:,:3] + + min_vertices = min(len(object_id), len(x), obj_mesh_points.shape[0]) + + obj_mesh_points = obj_mesh_points[:min_vertices] + object_ids = object_id[:min_vertices] + obj_mesh_colors = obj_mesh_colors[:min_vertices] + global_id = global_id[:min_vertices] + nyu40_id = nyu40_id[:min_vertices] + eigen13_id = eigen13_id[:min_vertices] + rio27_id = rio27_id[:min_vertices] + + vertices = np.empty(min_vertices, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('objectId', 'h'), ('globalId', 'h'), ('NYU40', 'u1'), ('Eigen13', 'u1'), ('RIO27', 'u1')]) - vertices['x'] = x.astype('f4') - vertices['y'] = y.astype('f4') - vertices['z'] = z.astype('f4') - vertices['red'] = red.astype('u1') - vertices['green'] = green.astype('u1') - vertices['blue'] = blue.astype('u1') - vertices['objectId'] = object_id.astype('h') + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') + # vertices['red'] = red.astype('u1') + # vertices['green'] = green.astype('u1') + # vertices['blue'] = blue.astype('u1') + vertices['x'] = obj_mesh_points[:, 0].astype('f4') + vertices['y'] = obj_mesh_points[:, 1].astype('f4') + vertices['z'] = obj_mesh_points[:, 2].astype('f4') + vertices['red'] = obj_mesh_colors[:, 0].astype('u1') + vertices['green'] = obj_mesh_colors[:, 1].astype('u1') + vertices['blue'] = obj_mesh_colors[:, 2].astype('u1') + 
vertices['objectId'] = object_ids.astype('h') vertices['globalId'] = global_id.astype('h') vertices['NYU40'] = nyu40_id.astype('u1') vertices['Eigen13'] = eigen13_id.astype('u1') @@ -136,4 +158,69 @@ def represents_int(s: str) -> bool: int(s) return True except ValueError: - return False \ No newline at end of file + return False + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def compute_box_3d(size, center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) \ No newline at end of file diff --git a/util/structured3d.py b/util/structured3d.py new file mode 100644 index 0000000..34d7c36 --- /dev/null +++ b/util/structured3d.py @@ -0,0 +1,169 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import cv2 + +S3D_SCANNET = { + 1: 'wall', + 2: 'floor', + 3: 'cabinet', + 4: 'bed', + 5: 'chair', + 6: 'sofa', + 7: 'table', + 8: 'door', + 9: 'window', + 10: 'bookshelf', + 11: 'picture', + 12: 'counter', + 13: 'blinds', + 14: 'desk', + 15: 'shelf', + 16: 'curtain', + 17: 'dresser', + 18: 'pillow', + 19: 'mirror', + 20: 'mat', + 21: 'clothes', + 22: 'ceiling', + 23: 'books', + 24: 'refrigerator', + 25: 'tv', + 26: 'paper', + 27: 'towel', + 28: 'shower curtain', + 29: 'box', + 30: 'whiteboard', + 31: 'person', + 32: 'nightstand', + 33: 'toilet', + 34: 'sink', + 35: 'lamp', + 36: 'bathtub', + 37: 'bag', + 38: 'otherstructure', + 39: 'otherfurniture', + 40: 'otherprop'} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id, room_id): + + filename_in 
= osp.join(data_dir, scan_id, '3D_rendering', room_id, 'room_mesh.ply') + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + vertex_object_ids = np.array(ply_data['vertex']['object_id']) + vertex_nyu40ids = np.array(ply_data['vertex']['nyu40id']) + # vertex_targetids = np.array(ply_data['vertex']['target_id']) + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + ('objectId', 'i4'), + ('nyu40id', 'i4') + # ('targetId', 'i4') + ] + + # scene_vertices = np.column_stack([x, y, z]) + # center_points = np.mean(scene_vertices, axis=0) + # center_points[2] = np.min(scene_vertices[:, 2]) + # scene_vertices = scene_vertices - center_points + + vertices = np.empty(len(x), dtype=vertex_dtype) + + # vertices['x'] = scene_vertices[:, 0].astype('f4') + # vertices['y'] = scene_vertices[:, 1].astype('f4') + # vertices['z'] = scene_vertices[:, 2].astype('f4') + + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + vertices['nyu40id'] = vertex_nyu40ids.astype('i4') + # vertices['targetId'] = np.zeros_like(x).astype('i4') + # vertices['targetId'] = vertex_targetids.astype('i4') + return vertices + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = np.vstack((U, V, W)) + + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, frame_idx) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_pose(scan_dir, frame_id): + pose_path = osp.join(scan_dir, frame_id, 'camera_pose.txt') + camera_info = np.loadtxt(pose_path) + rgb_image_path = osp.join(scan_dir, frame_id, 'rgb_rawlight.png') + color = cv2.imread(rgb_image_path) + height, width = color.shape[:2] + rot, trans, K = parse_camera_info(camera_info, height, width) + + trans = np.array(trans) / 1000 + extrinsic = np.eye(4) + extrinsic[:3, :3] = rot.T + extrinsic[:3, -1] = trans + extrinsic = np.linalg.inv(extrinsic) + + return extrinsic + +def load_intrinsics(scene_folder): + camera_info = np.loadtxt(osp.join(scene_folder, '0', 'camera_pose.txt')) + rgb_image_path = osp.join(scene_folder, '0', 'rgb_rawlight.png') + rgb_img = cv2.imread(rgb_image_path) + height, width = rgb_img.shape[:2] + _, _, K = parse_camera_info(camera_info, height, width) + intrinsics = {} + intrinsics['intrinsic_mat'] = K + intrinsics['width'] = width + intrinsics['height'] = height + return intrinsics \ No newline at end of file
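
---

The snippets below are illustrative sketches only, not part of the patch. They assume the repository's `util` package is importable and use hypothetical paths; function names and signatures come from the new modules added above.

A minimal walk-through of the MultiScan helpers (`load_ply_data`, `load_frame_idxs`, `load_all_poses`, `load_meta_intrinsics`, `read_label_map`, `MULTISCAN_SCANNET`), assuming the MultiScan scans are laid out as described in DATA.md:

```python
# Illustrative sketch -- not part of the patch. Paths are hypothetical placeholders.
import os.path as osp
from util import multiscan

data_dir = '/path/to/MultiScan/scans'      # hypothetical MultiScan scans root
scan_id = 'scene_00000_00'
scan_dir = osp.join(data_dir, scan_id)

# Per-vertex XYZ/RGB with object IDs propagated from faces to vertices.
vertices = multiscan.load_ply_data(data_dir, scan_id)

# Frame indices parsed from <scan_dir>/sequence/*.jpg, keeping every 10th frame.
frame_idxs = multiscan.load_frame_idxs(scan_dir, skip=10)

# Camera-to-world poses (one 4x4 per frame), aligned via the scan's *.align.json
# transform, plus colour-camera intrinsics from <scan_id>.json.
poses = multiscan.load_all_poses(scan_dir, frame_idxs)
intrinsics = multiscan.load_meta_intrinsics(scan_dir, scan_id, stream_type="color_camera")

# Raw MultiScan label -> ScanNet category -> NYU40 id string, using the mapping above
# together with scannetv2-labels.combined.tsv (hypothetical metadata directory).
label_map = multiscan.read_label_map('/path/to/scannet_metadata')
scannet_label = multiscan.MULTISCAN_SCANNET.get('basin', 'object')   # -> 'sink'
nyu40_id = label_map[scannet_label]
```

The axis-alignment helpers added to `util/scan3r.py` (`compute_box_3d`, `rotate_z_axis_by_degrees`, `calc_align_matrix`) perform a brute-force search over rotation angles about z, counting how many object boxes become axis-aligned at each angle and returning the most common one. A small sketch of that flow, with made-up box parameters:

```python
# Illustrative sketch -- not part of the patch. Box sizes/centers are made up.
import numpy as np
from util import scan3r

# Each box is an (8, 3) array of corners; the last four corners form the lower face
# that is_axis_aligned() inspects inside calc_align_matrix().
boxes = [
    scan3r.compute_box_3d([1.0, 0.5, 2.0], np.array([0.0, 0.0, 1.0]), np.eye(3)),
    scan3r.compute_box_3d([0.8, 0.8, 0.4], np.array([2.0, 1.0, 0.2]), np.eye(3)),
]

angle = scan3r.calc_align_matrix(boxes)   # best angle in degrees, searched in [-45, 45] (fallback [-90, 90])
aligned_boxes = [scan3r.rotate_z_axis_by_degrees(box, angle) for box in boxes]
```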