init
YvanYin committed Mar 28, 2020
1 parent 329665f commit 1ff9348
Showing 42 changed files with 3,797 additions and 2 deletions.
27 changes: 27 additions & 0 deletions Installation.md
@@ -0,0 +1,27 @@
## Installation

### Requirements
- PyTorch >= 1.1.0
- torchvision == 0.2.1
- matplotlib
- opencv-python
- dill
- scipy
- yaml

### Step-by-step installation
```bash
# First, make sure conda is set up properly with the right environment

conda create -n VNL python=3.6
conda activate VNL


# basic packages
conda install matplotlib dill pyyaml opencv scipy

# follow PyTorch installation in https://pytorch.org/get-started/locally/
# we give the instructions for CUDA 9.0
conda install -c pytorch pytorch torchvision=0.2.1 cudatoolkit=9.0

```
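
A quick way to confirm the environment before running anything (a minimal sketch; it only checks package versions and is not specific to this repository):

```python
# Sanity check for the environment created above.
import cv2
import torch
import torchvision

print('PyTorch:', torch.__version__)            # expect >= 1.1.0
print('torchvision:', torchvision.__version__)  # expect 0.2.1
print('OpenCV:', cv2.__version__)
print('CUDA available:', torch.cuda.is_available())
```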
65 changes: 63 additions & 2 deletions README.md
@@ -1,8 +1,69 @@
#### DiverseDepth: Affine-invariant Depth Prediction Using Diverse Data.

The code and dataset will be released.
This repository contains the source code of our paper:
[Wei Yin, Xinlong Wang, Chunhua Shen, Yifan Liu, Zhi Tian, Songcen Xu, Changming Sun, DiverseDepth: Affine-invariant Depth Prediction Using Diverse Data](https://arxiv.org/abs/2002.00569).

## Some Results

![Any images online](./examples/any_imgs.jpg)
![Point cloud](./examples/pcd.png)

## Some Dataset Examples
![Dataset](./examples/dataset_examples.png)


****
## Highlights
- **Generalization:** We have evaluated our method on several zero-shot datasets to test its generalization.



****
## Installation
- Please refer to [Installation](./Installation.md).

## Datasets
We collect data from multiple sources to construct our DiverseDepth dataset, including stereoscopic images crawled online and images from DIML and Taskonomy. These three parts form the foreground part (Part-fore), the outdoor scenes (Part-out) and the indoor scenes (Part-in) of our dataset.
The sizes of the three parts are:
- Part-in: 93838 images
- Part-out: 120293 images
- Part-fore: 109703 images
We will release the dataset as soon as possible.

## Model Zoo
- ResNext50_32x4d backbone, trained on the DiverseDepth dataset; download [here](https://cloudstor.aarnet.edu.au/plus/s/ixWf3nTJFZ0YE4q)



## Inference

```bash
# Run inference on the NYUDV2 dataset
python ./tools/test_diversedepth_nyu.py \
--dataroot ./datasets/NYUDV2 \
--dataset nyudv2 \
--cfg_file lib/configs/resnext50_32x4d_diversedepth_regression_vircam \
--load_ckpt ./model.pth

# Test depth prediction on arbitrary images; please replace the data dir in test_any_images.py
python ./tools/test_any_diversedepth.py \
--dataroot ./ \
--dataset any \
--cfg_file lib/configs/resnext50_32x4d_diversedepth_regression_vircam \
--load_ckpt ./model.pth
```
If you want to test on the KITTI dataset, please see [here](./datasets/KITTI/README.md).
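
The test scripts read images with OpenCV, so inputs arrive as BGR arrays that are flipped to RGB and normalized before being fed to the network (see `scale_torch` in the dataset loaders below). A minimal sketch of that convention; the mean/std values here are standard ImageNet statistics used only as placeholders, while the actual values come from `cfg.DATASET.RGB_PIXEL_MEANS` / `cfg.DATASET.RGB_PIXEL_VARS`:

```python
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms

def load_image_for_inference(path):
    """Read an image the way the dataset loaders do: BGR -> RGB, scale to [0, 1], normalize."""
    img = cv2.imread(path, -1)                # [H, W, C], BGR
    img = img.transpose((2, 0, 1))[::-1]      # [C, H, W], RGB
    img = torch.from_numpy(img.astype(np.float32).copy()) / 255.
    # ImageNet statistics as placeholders for cfg.DATASET.RGB_PIXEL_MEANS / RGB_PIXEL_VARS
    img = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(img)
    return img.unsqueeze(0)                   # add a batch dimension
```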



### Citation
```
@inproceedings{Yin2019enforcing,
  title     = {Enforcing geometric constraints of virtual normal for depth prediction},
  author    = {Yin, Wei and Wang, Xinlong and Shen, Chunhua and Liu, Yifan and Tian, Zhi and Xu, Songcen and Sun, Changming},
  booktitle = {arXiv: 2002.00569},
  year      = {2020}
}
```
### Contact
Wei Yin: [email protected]

1 change: 1 addition & 0 deletions data/__init__.py
@@ -0,0 +1 @@

22 changes: 22 additions & 0 deletions data/any_dataset.py
@@ -0,0 +1,22 @@
import cv2
import json
import torch
import os.path
import numpy as np
import scipy.io as sio
from lib.core.config import cfg
import torchvision.transforms as transforms
from lib.utils.logging import setup_logging

logger = setup_logging(__name__)


class ANYDataset():
def initialize(self, opt):
self.data_size = 0

def __len__(self):
return self.data_size

def name(self):
return 'ANY'
218 changes: 218 additions & 0 deletions data/kitti_dataset.py
@@ -0,0 +1,218 @@
import cv2
import json
import torch
import os.path
import numpy as np
from lib.core.config import cfg
import torchvision.transforms as transforms
from lib.utils.logging import setup_logging

logger = setup_logging(__name__)


class KITTIDataset():
def initialize(self, opt):
self.opt = opt
self.root = opt.dataroot
self.dir_anno = os.path.join(cfg.ROOT_DIR, opt.dataroot, 'annotations', opt.phase_anno + '_annotations.json')
self.A_paths, self.B_paths = self.getData()
self.data_size = len(self.A_paths)
        self.depth_normalize = 255. * 80.  # raw depth values are divided by this to map them into [0.0, 1.0]
        self.uniform_size = (385, 1243)  # (H, W) to which every KITTI frame is padded before augmentation

def getData(self):
with open(self.dir_anno, 'r') as load_f:
AB_anno = json.load(load_f)
A_list = [os.path.join(cfg.ROOT_DIR, self.opt.dataroot, AB_anno[i]['rgb_path']) for i in range(len(AB_anno))]
B_list = [os.path.join(cfg.ROOT_DIR, self.opt.dataroot, AB_anno[i]['depth_path']) for i in range(len(AB_anno))]
logger.info('Loaded Kitti data!')
return A_list, B_list

def __getitem__(self, anno_index):
if 'train' in self.opt.phase:
data = self.online_aug_train(anno_index)
else:
data = self.online_aug_val_test(anno_index)
return data

def online_aug_train(self, idx):
"""
        Augment the training data online with random flips, resizing and cropping. The invalid parts of the
        depth map are set to -1.0, while the corresponding entries in the depth bins are set to
        cfg.MODEL.DECODER_OUTPUT_C + 1.
:param idx: data index.
"""
A_path = self.A_paths[idx]
B_path = self.B_paths[idx]

A = cv2.imread(A_path, -1) # [H, W, C] C:bgr
B = cv2.imread(B_path, -1) / self.depth_normalize #[0.0, 1.0]

flip_flg, resize_size, crop_size, pad, resize_ratio = self.set_flip_pad_reshape_crop(A)

A_crop = self.flip_pad_reshape_crop(A, flip_flg, resize_size, crop_size, pad, 128)
B_crop = self.flip_pad_reshape_crop(B, flip_flg, resize_size, crop_size, pad, -1)

A_crop = A_crop.transpose((2, 0, 1))
B_crop = B_crop[np.newaxis, :, :]

# change the color channel, bgr->rgb
A_crop = A_crop[::-1, :, :]

# to torch, normalize
A_crop = self.scale_torch(A_crop, 255.)
B_crop = self.scale_torch(B_crop, resize_ratio)

        B_classes = self.depth_to_bins(B_crop)

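        # when the random crop starts at the very top (start_y == 0), the top (pad[0] + 50) rows,
        # scaled by resize_ratio, are recorded as an invalid side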
invalid_side = [0, 0, 0, 0] if crop_size[1] != 0 else [int((pad[0] + 50)*resize_ratio), 0, 0, 0]

A = np.pad(A, ((pad[0], pad[1]), (pad[2], pad[3]), (0, 0)), 'constant', constant_values=(0, 0))
B = np.pad(B, ((pad[0], pad[1]), (pad[2], pad[3])), 'constant', constant_values=(0, 0))

data = {'A': A_crop, 'B': B_crop, 'A_raw': A, 'B_raw': B, 'B_classes': B_classes, 'A_paths': A_path,
'B_paths': B_path, 'invalid_side': np.array(invalid_side), 'pad_raw': np.array(pad)}
return data

def online_aug_val_test(self, idx):
A_path = self.A_paths[idx]
B_path = self.B_paths[idx]

A = cv2.imread(A_path, -1) # [H, W, C] C:bgr

B = cv2.imread(B_path, 0) / self.depth_normalize # [0.0, 1.0]

flip_flg, resize_size, crop_size, pad, resize_ratio = self.set_flip_pad_reshape_crop(A)

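        # at test time, three overlapping crops (left / middle / right) together cover the full padded
        # width; their windows are returned to the caller in 'crop_lmr'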
crop_size_l = [pad[2], 0, cfg.DATASET.CROP_SIZE[1], cfg.DATASET.CROP_SIZE[0]]
crop_size_m = [cfg.DATASET.CROP_SIZE[1] + pad[2] - 20, 0, cfg.DATASET.CROP_SIZE[1], cfg.DATASET.CROP_SIZE[0]]
crop_size_r = [self.uniform_size[1] - cfg.DATASET.CROP_SIZE[1], 0, cfg.DATASET.CROP_SIZE[1], cfg.DATASET.CROP_SIZE[0]]

A_crop_l = self.flip_pad_reshape_crop(A, flip_flg, resize_size, crop_size_l, pad, 128)
A_crop_l = A_crop_l.transpose((2, 0, 1))
A_crop_l = A_crop_l[::-1, :, :]

A_crop_m = self.flip_pad_reshape_crop(A, flip_flg, resize_size, crop_size_m, pad, 128)
A_crop_m = A_crop_m.transpose((2, 0, 1))
A_crop_m = A_crop_m[::-1, :, :]

A_crop_r = self.flip_pad_reshape_crop(A, flip_flg, resize_size, crop_size_r, pad, 128)
A_crop_r = A_crop_r.transpose((2, 0, 1))
A_crop_r = A_crop_r[::-1, :, :]

A_crop_l = self.scale_torch(A_crop_l, 255.)
A_crop_m = self.scale_torch(A_crop_m, 255.)
A_crop_r = self.scale_torch(A_crop_r, 255.)
A_pad = np.pad(A, ((pad[0], pad[1]), (pad[2], pad[3]), (0, 0)), 'constant', constant_values=(0, 0))
B_pad = np.pad(B, ((pad[0], pad[1]), (pad[2], pad[3])), 'constant', constant_values=(0, 0))
crop_lmr = np.array((crop_size_l, crop_size_m, crop_size_r))

A_out = A_pad.transpose((2, 0, 1))
B_out = B_pad[np.newaxis, :, :]
# change the color channel, bgr->rgb
A_out = A_out[::-1, :, :]
# to torch, normalize
A_out = self.scale_torch(A_out, 255.)
B_out = self.scale_torch(B_out, 1.0)
invalid_side = pad
data = {'A': A_out, 'B': B_out,'A_l': A_crop_l, 'A_m': A_crop_m, 'A_r': A_crop_r,
'A_raw': A_pad, 'B_raw': B_pad, 'A_paths': A_path, 'B_paths': B_path, 'pad_raw': np.array(pad), 'crop_lmr': crop_lmr}
return data

def set_flip_pad_reshape_crop(self, A):
"""
Set flip, padding, reshaping and cropping flags.
:param A: Input image, [H, W, C]
        :return: Data augmentation parameters
"""
# flip
flip_prob = np.random.uniform(0.0, 1.0)
flip_flg = True if flip_prob > 0.5 and 'train' in self.opt.phase else False

# pad
pad_height = self.uniform_size[0] - A.shape[0]
pad_width = self.uniform_size[1] - A.shape[1]
pad = [pad_height, 0, pad_width, 0] #[up, down, left, right]

# reshape
ratio_list = [1.0, 1.2, 1.5, 1.8, 2.0]#
resize_ratio = ratio_list[np.random.randint(len(ratio_list))] if 'train' in self.opt.phase else 1.0
resize_size = [int((A.shape[0]+pad[0]+pad[1]) * resize_ratio + 0.5),
int((A.shape[1]+pad[2]+pad[3]) * resize_ratio + 0.5)]

# crop
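        # sample a crop window inside the valid (non-padded) region; when the resized image is tall
        # enough, the top (50 + pad) rows are additionally excluded from the vertical crop range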
start_y = 0 if resize_size[0] < (50 + pad[0] + pad[1]) * resize_ratio + cfg.DATASET.CROP_SIZE[0]\
else np.random.randint(int((50 + pad[0]) * resize_ratio), resize_size[0] - cfg.DATASET.CROP_SIZE[0] - pad[1] * resize_ratio)
start_x = np.random.randint(pad[2] * resize_ratio, resize_size[1] - cfg.DATASET.CROP_SIZE[1] - pad[3] * resize_ratio)
crop_height = cfg.DATASET.CROP_SIZE[0]
crop_width = cfg.DATASET.CROP_SIZE[1]
crop_size = [start_x, start_y, crop_width, crop_height]
return flip_flg, resize_size, crop_size, pad, resize_ratio

def flip_pad_reshape_crop(self, img, flip, resize_size, crop_size, pad, pad_value=0):
"""
        Preprocess an input image or ground-truth depth map.
:param img: RGB image or depth image
:param flip: Flipping flag, True or False
:param resize_size: Resizing size
:param crop_size: Cropping size
:param pad: Padding region
:param pad_value: Padding value
:return: Processed image
"""
if len(img.shape) == 1:
return img
# Flip
if flip:
img = np.flip(img, axis=1)

# Pad the raw image
if len(img.shape) == 3:
img_pad = np.pad(img, ((pad[0], pad[1]), (pad[2], pad[3]), (0, 0)), 'constant',
constant_values=(pad_value, pad_value))
else:
img_pad = np.pad(img, ((pad[0], pad[1]), (pad[2], pad[3])), 'constant',
constant_values=(pad_value, pad_value))
# Resize the raw image
img_resize = cv2.resize(img_pad, (resize_size[1], resize_size[0]), interpolation=cv2.INTER_LINEAR)
# Crop the resized image
img_crop = img_resize[crop_size[1]:crop_size[1] + crop_size[3], crop_size[0]:crop_size[0] + crop_size[2]]

return img_crop

def depth_to_bins(self, depth):
"""
        Discretize depth into depth bins.
        Invalid and padded areas are marked as cfg.MODEL.DECODER_OUTPUT_C + 1.
:param depth: 1-channel depth, [1, h, w]
:return: depth bins [1, h, w]
"""
invalid_mask = depth < 0.
depth[depth < cfg.DATASET.DEPTH_MIN] = cfg.DATASET.DEPTH_MIN
depth[depth > cfg.DATASET.DEPTH_MAX] = cfg.DATASET.DEPTH_MAX
bins = ((torch.log10(depth) - cfg.DATASET.DEPTH_MIN_LOG) / cfg.DATASET.DEPTH_BIN_INTERVAL).to(torch.int)
bins[invalid_mask] = cfg.MODEL.DECODER_OUTPUT_C + 1
bins[bins == cfg.MODEL.DECODER_OUTPUT_C] = cfg.MODEL.DECODER_OUTPUT_C - 1
depth[invalid_mask] = -1.0
return bins

def scale_torch(self, img, scale):
"""
        Scale the image and return it as a torch.Tensor.
        :param img: input image. [C, H, W]
        :param scale: the scale factor. float
        :return: img. [C, H, W]
"""
img = img.astype(np.float32)
img /= scale
img = torch.from_numpy(img.copy())
if img.size(0) == 3:
img = transforms.Normalize(cfg.DATASET.RGB_PIXEL_MEANS, cfg.DATASET.RGB_PIXEL_VARS)(img)
else:
img = transforms.Normalize((0,), (1,))(img)
return img

def __len__(self):
return self.data_size

def name(self):
return 'KITTI'