Batch test automatic script (#3757)
* add batch test scripts

* update slurm

* add ar computer

* clean

* add some comments

* fix bug

* minor fix
yhcao6 authored Sep 21, 2020
1 parent 14b5d8e commit a10d24d
Showing 2 changed files with 231 additions and 0 deletions.
212 changes: 212 additions & 0 deletions .dev_scripts/batch_test.py
@@ -0,0 +1,212 @@
"""
some instructions
1. Fill the models that needs to be checked in the modelzoo_dict
2. Arange the structure of the directory as follows, the script will find the
corresponding config itself:
model_dir/model_family/checkpoints
e.g.: models/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
models/faster_rcnn/faster_rcnn_r101_fpn_1x_coco_20200130-047c8118.pth
3. Excute the batch_test.sh
"""

import argparse
import json
import os
import subprocess

import mmcv
import torch
from mmcv import Config, get_logger
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint

from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.core import wrap_fp16_model
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

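# Reference metrics for every config that should be checked; a model is
# reported as abnormal if its evaluated metric deviates from the reference
# value by more than 0.003 (see main() below).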
modelzoo_dict = {
    'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py': {
        'bbox': 0.374
    },
    'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py': {
        'bbox': 0.382,
        'segm': 0.347
    },
    'configs/rpn/rpn_r50_fpn_1x_coco.py': {
        'AR@1000': 0.582
    }
}


def parse_args():
    parser = argparse.ArgumentParser(
        description='The script used for checking the correctness \
            of batch inference')
    parser.add_argument('model_dir', help='directory of models')
    parser.add_argument(
        'json_out', help='the output json records test information like mAP')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def check_finish(all_model_dict, result_file):
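    """Return the next config that has not been tested yet.

    If every config in ``all_model_dict`` already has a record in
    ``result_file``, append a 'finished' line instead (the loop in
    batch_test.sh watches for this sentinel) and return None.
    """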
    # check if all models are checked
    tested_cfgs = []
    with open(result_file, 'r+') as f:
        for line in f:
            line = json.loads(line)
            tested_cfgs.append(line['cfg'])
    is_finish = True
    for cfg in sorted(all_model_dict.keys()):
        if cfg not in tested_cfgs:
            return cfg
    if is_finish:
        with open(result_file, 'a+') as f:
            f.write('finished\n')


def dump_dict(record_dict, json_out):
    # dump result json dict
    with open(json_out, 'a+') as f:
        mmcv.dump(record_dict, f, file_format='json')
        f.write('\n')


def main():
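    """Check one model per invocation.

    Pick the next config without a record in the json file, run batch
    inference with its checkpoint, compare the evaluated metrics against
    modelzoo_dict and append a record; batch_test.sh relaunches this script
    until every model has been checked.
    """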
    args = parse_args()
    # touch the output json if it does not exist
    with open(args.json_out, 'a+'):
        pass
    # init distributed env first, since the logger depends on the dist info
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, backend='nccl')
    rank, world_size = get_dist_info()

    logger = get_logger('root')
    # read info of checkpoints and config
    result_dict = dict()
    for model_family_dir in os.listdir(args.model_dir):
        for model in os.listdir(
                os.path.join(args.model_dir, model_family_dir)):
            # cpt: rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth
            # cfg: rpn_r50_fpn_1x_coco.py
            # strip the 18-character '_<date>-<hash>' suffix to recover the
            # config name
            cfg = model.split('.')[0][:-18] + '.py'
            cfg_path = os.path.join('configs', model_family_dir, cfg)
            assert os.path.isfile(
                cfg_path), f'{cfg_path} is not a valid config path'
            cpt_path = os.path.join(args.model_dir, model_family_dir, model)
            result_dict[cfg_path] = cpt_path
            assert cfg_path in modelzoo_dict, \
                f'please fill in the performance of cfg: {cfg_path}'
    cfg = check_finish(result_dict, args.json_out)
    cpt = result_dict[cfg]
    try:
        cfg_name = cfg
        logger.info(f'evaluate {cfg}')
        record = dict(cfg=cfg, cpt=cpt)
        cfg = Config.fromfile(cfg)
        # cfg.data.test.ann_file = 'data/val_0_10.json'
        # set cudnn_benchmark
        if cfg.get('cudnn_benchmark', False):
            torch.backends.cudnn.benchmark = True
        cfg.model.pretrained = None
        if cfg.model.get('neck'):
            if isinstance(cfg.model.neck, list):
                for neck_cfg in cfg.model.neck:
                    if neck_cfg.get('rfp_backbone'):
                        if neck_cfg.rfp_backbone.get('pretrained'):
                            neck_cfg.rfp_backbone.pretrained = None
            elif cfg.model.neck.get('rfp_backbone'):
                if cfg.model.neck.rfp_backbone.get('pretrained'):
                    cfg.model.neck.rfp_backbone.pretrained = None

        # in case the test dataset is concatenated
        if isinstance(cfg.data.test, dict):
            cfg.data.test.test_mode = True
        elif isinstance(cfg.data.test, list):
            for ds_cfg in cfg.data.test:
                ds_cfg.test_mode = True

        # build the dataloader
        samples_per_gpu = 2  # hack: test with 2 images per GPU
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            samples_per_gpu=samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)

        # build the model and load checkpoint
        model = build_detector(
            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            wrap_fp16_model(model)

        checkpoint = load_checkpoint(model, cpt, map_location='cpu')
        # old versions did not save class info in checkpoints,
        # this workaround is for backward compatibility
        if 'CLASSES' in checkpoint['meta']:
            model.CLASSES = checkpoint['meta']['CLASSES']
        else:
            model.CLASSES = dataset.CLASSES

        if not distributed:
            model = MMDataParallel(model, device_ids=[0])
            outputs = single_gpu_test(model, data_loader)
        else:
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False)
            outputs = multi_gpu_test(model, data_loader, 'tmp')
        if rank == 0:
            ref_mAP_dict = modelzoo_dict[cfg_name]
            metrics = list(ref_mAP_dict.keys())
            metrics = [
                m if m != 'AR@1000' else 'proposal_fast' for m in metrics
            ]
            eval_results = dataset.evaluate(outputs, metrics)
            print(eval_results)
            for metric in metrics:
                if metric == 'proposal_fast':
                    ref_metric = modelzoo_dict[cfg_name]['AR@1000']
                    eval_metric = eval_results['AR@1000']
                else:
                    ref_metric = modelzoo_dict[cfg_name][metric]
                    eval_metric = eval_results[f'{metric}_mAP']
                if abs(ref_metric - eval_metric) > 0.003:
                    record['is_normal'] = False
            dump_dict(record, args.json_out)
            check_finish(result_dict, args.json_out)
    except Exception as e:
        logger.error(f'rank: {rank} test failed with error: {e}')
        record['terminate'] = True
        dump_dict(record, args.json_out)
        check_finish(result_dict, args.json_out)
        # hack: deliberately call a non-existent command so the process
        # exits with an error instead of hanging
        subprocess.call('xxx')


if __name__ == '__main__':
    main()
19 changes: 19 additions & 0 deletions .dev_scripts/batch_test.sh
@@ -0,0 +1,19 @@
export PYTHONPATH=${PWD}

partition=$1
model_dir=$2
json_out=$3
job_name=batch_test
gpus=8
gpu_per_node=8

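# Keep resubmitting the job until batch_test.py appends the 'finished'
# sentinel to the json record, so an interrupted or failed run resumes from
# the next unchecked model.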
touch $json_out
lastLine=$(tail -n 1 $json_out)
while [ "$lastLine" != "finished" ]
do
    srun -p ${partition} --gres=gpu:${gpu_per_node} -n${gpus} --ntasks-per-node=${gpu_per_node} \
        --job-name=${job_name} --kill-on-bad-exit=1 \
        python .dev_scripts/batch_test.py $model_dir $json_out --launcher='slurm'
    lastLine=$(tail -n 1 $json_out)
    echo $lastLine
done
