Commit
* add batch test scripts
* update slurm
* add ar computer
* clean
* add some comments
* fix bug
* minor fix
Showing 2 changed files with 231 additions and 0 deletions.
.dev_scripts/batch_test.py
@@ -0,0 +1,212 @@
"""
Some instructions:
1. Fill in the models that need to be checked in the modelzoo_dict
2. Arrange the directory structure as follows; the script will find the
   corresponding config itself:
       model_dir/model_family/checkpoints
   e.g.: models/faster_rcnn/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
         models/faster_rcnn/faster_rcnn_r101_fpn_1x_coco_20200130-047c8118.pth
3. Execute batch_test.sh
"""
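
# Each run of this script evaluates a single model and appends one json record
# (cfg, cpt, and optional 'is_normal'/'terminate' flags) to the output file;
# once every model has a record, the literal line 'finished' is appended,
# which tells the accompanying batch_test.sh loop to stop.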
import argparse
import json
import os
import subprocess

import mmcv
import torch
from mmcv import Config, get_logger
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint

from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.core import wrap_fp16_model
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

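# Reference metrics from the model zoo, keyed by config path; results produced
# with batched inference must stay within 0.003 of these values, otherwise the
# run is recorded as abnormal.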
modelzoo_dict = {
    'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py': {
        'bbox': 0.374
    },
    'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py': {
        'bbox': 0.382,
        'segm': 0.347
    },
    'configs/rpn/rpn_r50_fpn_1x_coco.py': {
        'AR@1000': 0.582
    }
}


def parse_args():
    parser = argparse.ArgumentParser(
        description='The script used for checking the correctness \
            of batch inference')
    parser.add_argument('model_dir', help='directory of models')
    parser.add_argument(
        'json_out', help='the output json records test information like mAP')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def check_finish(all_model_dict, result_file):
    # check if all models have been tested: return the first config without a
    # record in result_file; if every config has one, append the 'finished'
    # sentinel that batch_test.sh polls for
    tested_cfgs = []
    with open(result_file, 'r+') as f:
        for line in f:
            line = json.loads(line)
            tested_cfgs.append(line['cfg'])
    is_finish = True
    for cfg in sorted(all_model_dict.keys()):
        if cfg not in tested_cfgs:
            return cfg
    if is_finish:
        with open(result_file, 'a+') as f:
            f.write('finished\n')


def dump_dict(record_dict, json_out):
    # dump result json dict
    with open(json_out, 'a+') as f:
        mmcv.dump(record_dict, f, file_format='json')
        f.write('\n')


def main():
    args = parse_args()
    # touch the output json if it does not exist
    with open(args.json_out, 'a+'):
        pass
    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, backend='nccl')
    rank, world_size = get_dist_info()

    logger = get_logger('root')

    # read info of checkpoints and configs
    result_dict = dict()
    for model_family_dir in os.listdir(args.model_dir):
        for model in os.listdir(
                os.path.join(args.model_dir, model_family_dir)):
            # cpt: rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth
            # cfg: rpn_r50_fpn_1x_coco.py
            # strip the 18-character '_<date>-<hash>' suffix (and '.pth') to
            # recover the config name
            cfg = model.split('.')[0][:-18] + '.py'
            cfg_path = os.path.join('configs', model_family_dir, cfg)
            assert os.path.isfile(
                cfg_path), f'{cfg_path} is not a valid config path'
            cpt_path = os.path.join(args.model_dir, model_family_dir, model)
            result_dict[cfg_path] = cpt_path
            assert cfg_path in modelzoo_dict, f'please fill the ' \
                f'performance of cfg: {cfg_path}'
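    # pick the next config that has no record in json_out yet; each launch of
    # this script tests exactly one model, and batch_test.sh keeps
    # re-launching it until the 'finished' sentinel appears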
    cfg = check_finish(result_dict, args.json_out)
    cpt = result_dict[cfg]
    try:
        cfg_name = cfg
        logger.info(f'evaluate {cfg}')
        record = dict(cfg=cfg, cpt=cpt)
        cfg = Config.fromfile(cfg)
        # cfg.data.test.ann_file = 'data/val_0_10.json'
        # set cudnn_benchmark
        if cfg.get('cudnn_benchmark', False):
            torch.backends.cudnn.benchmark = True
        cfg.model.pretrained = None
        if cfg.model.get('neck'):
            if isinstance(cfg.model.neck, list):
                for neck_cfg in cfg.model.neck:
                    if neck_cfg.get('rfp_backbone'):
                        if neck_cfg.rfp_backbone.get('pretrained'):
                            neck_cfg.rfp_backbone.pretrained = None
            elif cfg.model.neck.get('rfp_backbone'):
                if cfg.model.neck.rfp_backbone.get('pretrained'):
                    cfg.model.neck.rfp_backbone.pretrained = None

        # in case the test dataset is concatenated
        if isinstance(cfg.data.test, dict):
            cfg.data.test.test_mode = True
        elif isinstance(cfg.data.test, list):
            for ds_cfg in cfg.data.test:
                ds_cfg.test_mode = True

        # build the dataloader
        samples_per_gpu = 2  # hack: test with 2 images per gpu
        if samples_per_gpu > 1:
            # replace 'ImageToTensor' with 'DefaultFormatBundle'
            # (needed when samples_per_gpu > 1)
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            samples_per_gpu=samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)

        # build the model and load checkpoint
        model = build_detector(
            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            wrap_fp16_model(model)

        checkpoint = load_checkpoint(model, cpt, map_location='cpu')
        # old versions did not save class info in checkpoints,
        # this workaround is for backward compatibility
        if 'CLASSES' in checkpoint['meta']:
            model.CLASSES = checkpoint['meta']['CLASSES']
        else:
            model.CLASSES = dataset.CLASSES

        if not distributed:
            model = MMDataParallel(model, device_ids=[0])
            outputs = single_gpu_test(model, data_loader)
        else:
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False)
            outputs = multi_gpu_test(model, data_loader, 'tmp')
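        # only rank 0 evaluates the gathered results and compares them with
        # the reference metrics in modelzoo_dict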
        if rank == 0:
            ref_mAP_dict = modelzoo_dict[cfg_name]
            metrics = list(ref_mAP_dict.keys())
            # dataset.evaluate() expects the metric name 'proposal_fast' for
            # the value reported back as 'AR@1000'
            metrics = [
                m if m != 'AR@1000' else 'proposal_fast' for m in metrics
            ]
            eval_results = dataset.evaluate(outputs, metrics)
            print(eval_results)
            for metric in metrics:
                if metric == 'proposal_fast':
                    ref_metric = modelzoo_dict[cfg_name]['AR@1000']
                    eval_metric = eval_results['AR@1000']
                else:
                    ref_metric = modelzoo_dict[cfg_name][metric]
                    eval_metric = eval_results[f'{metric}_mAP']
                # allow an absolute deviation of at most 0.003 from the
                # reference value
                if abs(ref_metric - eval_metric) > 0.003:
                    record['is_normal'] = False
            dump_dict(record, args.json_out)
            check_finish(result_dict, args.json_out)
    except Exception as e:
        logger.error(f'rank: {rank} test failed with error: {e}')
        record['terminate'] = True
        dump_dict(record, args.json_out)
        check_finish(result_dict, args.json_out)
        # hack: deliberately trigger an error here so the srun step exits
        # instead of hanging
        subprocess.call('xxx')


if __name__ == '__main__':
    main()
batch_test.sh
@@ -0,0 +1,19 @@
export PYTHONPATH=${PWD}

partition=$1
model_dir=$2
json_out=$3
job_name=batch_test
gpus=8
gpu_per_node=8

touch $json_out
lastLine=$(tail -n 1 $json_out)
while [ "$lastLine" != "finished" ]
do
    srun -p ${partition} --gres=gpu:${gpu_per_node} -n${gpus} --ntasks-per-node=${gpu_per_node} \
        --job-name=${job_name} --kill-on-bad-exit=1 \
        python .dev_scripts/batch_test.py $model_dir $json_out --launcher='slurm'
    lastLine=$(tail -n 1 $json_out)
    echo $lastLine
done