From af4b813091678c2ca4afbe5c440006d73b65f2aa Mon Sep 17 00:00:00 2001
From: zhuangpeiqin
Date: Wed, 7 Jul 2021 20:36:38 +0800
Subject: [PATCH 1/4] add persistent_workers flag when building dataloader

---
 mmaction/datasets/builder.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mmaction/datasets/builder.py b/mmaction/datasets/builder.py
index 3ab7997..26b1f51 100644
--- a/mmaction/datasets/builder.py
+++ b/mmaction/datasets/builder.py
@@ -93,6 +93,9 @@ def build_dataloader(dataset,
         worker_init_fn, num_workers=num_workers, rank=rank,
         seed=seed) if seed is not None else None

+    persistent_workers = num_workers > 0
+    # Keep the worker processes alive between epochs instead of re-creating
+    # them for every new iterator. Requires PyTorch >= 1.7.
     data_loader = DataLoader(
         dataset,
         batch_size=batch_size,
@@ -103,6 +106,7 @@ def build_dataloader(dataset,
         shuffle=shuffle,
         worker_init_fn=init_fn,
         drop_last=drop_last,
+        persistent_workers=persistent_workers
         **kwargs)

     return data_loader

From cb552dac1cafb46983e73cd61e078e492b2041cd Mon Sep 17 00:00:00 2001
From: PeiqinZhuang
Date: Wed, 7 Jul 2021 20:40:07 +0800
Subject: [PATCH 2/4] Update builder.py

---
 mmaction/datasets/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmaction/datasets/builder.py b/mmaction/datasets/builder.py
index 26b1f51..3c93321 100644
--- a/mmaction/datasets/builder.py
+++ b/mmaction/datasets/builder.py
@@ -106,7 +106,7 @@ def build_dataloader(dataset,
         shuffle=shuffle,
         worker_init_fn=init_fn,
         drop_last=drop_last,
-        persistent_workers=persistent_workers
+        persistent_workers=persistent_workers,
         **kwargs)

     return data_loader

From 5963d2203e66186d6d7680c593978b72cf8a420e Mon Sep 17 00:00:00 2001
From: PeiqinZhuang
Date: Tue, 26 Oct 2021 21:45:36 +0800
Subject: [PATCH 3/4] accelerate the reading speed of Kinetics

---
 tools/train_semi.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/train_semi.py b/tools/train_semi.py
index d5d333b..b09de9d 100644
--- a/tools/train_semi.py
+++ b/tools/train_semi.py
@@ -18,6 +18,11 @@
 # Custom imports
 from mmaction.apis import train_model_semi
+# Set the number of OpenCV threads to 0 when reading large images, e.g.
+# Kinetics with a short side of 320; roughly 3x faster when the CPU is busy.
+import cv2
+cv2.setNumThreads(0)
+


 def parse_args():
     parser = argparse.ArgumentParser(description='Train a recognizer')

From 00e62e916f8c3d8bb6d6af325a623d0e275d32ee Mon Sep 17 00:00:00 2001
From: PeiqinZhuang
Date: Tue, 26 Oct 2021 21:48:23 +0800
Subject: [PATCH 4/4] fix the inaccurate performance evaluation during training process

---
 mmaction/core/evaluation/eval_hooks.py | 557 +++++++++++++++++--------
 1 file changed, 380 insertions(+), 177 deletions(-)

diff --git a/mmaction/core/evaluation/eval_hooks.py b/mmaction/core/evaluation/eval_hooks.py
index d4fc48e..c870054 100644
--- a/mmaction/core/evaluation/eval_hooks.py
+++ b/mmaction/core/evaluation/eval_hooks.py
@@ -1,188 +1,391 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
 import os.path as osp
+import warnings
 from math import inf

-import mmcv
-from mmcv.runner import Hook
+import torch.distributed as dist
+from torch.nn.modules.batchnorm import _BatchNorm
 from torch.utils.data import DataLoader

-from mmaction.utils import get_root_logger
-
-
-class EvalHook(Hook):
-    """Non-Distributed evaluation hook.
-
-    This hook will regularly perform evaluation in a given interval when
-    performing in non-distributed environment.
-
-    Args:
-        dataloader (DataLoader): A PyTorch dataloader.
-        interval (int): Evaluation interval (by epochs). Default: 1.
-        gpu_collect (bool): Whether to use gpu or cpu to collect results.
-            Default: False.
-        save_best (bool): Whether to save best checkpoint during evaluation.
-            Default: True.
-        key_indicator (str | None): Key indicator to measure the best
-            checkpoint during evaluation when ``save_best`` is set to True.
-            Options are the evaluation metrics to the test dataset. e.g.,
-            ``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
-            ``mean_average_precision`` for action recognition dataset
-            (RawframeDataset and VideoDataset). ``AR@AN``, ``auc`` for action
-            localization dataset (ActivityNetDataset). Default: `top1_acc`.
-        rule (str | None): Comparison rule for best score. If set to None,
-            it will infer a reasonable rule. Default: 'None'.
-        eval_kwargs (dict, optional): Arguments for evaluation.
-    """
-
-    rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y}
-    init_value_map = {'greater': -inf, 'less': inf}
-    greater_keys = ['acc', 'top', 'AR@', 'auc']
-    less_keys = ['loss']
-
-    def __init__(self,
-                 dataloader,
-                 interval=1,
-                 gpu_collect=False,
-                 save_best=True,
-                 key_indicator='top1_acc',
-                 rule=None,
-                 **eval_kwargs):
-        if not isinstance(dataloader, DataLoader):
-            raise TypeError(f'dataloader must be a pytorch DataLoader, '
-                            f'but got {type(dataloader)}')
-        if save_best and not key_indicator:
-            raise ValueError('key_indicator should not be None, when '
-                             'save_best is set to True.')
-        if rule not in self.rule_map and rule is not None:
-            raise KeyError(f'rule must be greater, less or None, '
-                           f'but got {rule}.')
-
-        if rule is None and save_best:
-            if any(key in key_indicator for key in self.greater_keys):
-                rule = 'greater'
-            elif any(key in key_indicator for key in self.less_keys):
-                rule = 'less'
-            else:
-                raise ValueError(
-                    f'key_indicator must be in {self.greater_keys} '
-                    f'or in {self.less_keys} when rule is None, '
-                    f'but got {key_indicator}')
-
-        self.dataloader = dataloader
-        self.interval = interval
-        self.gpu_collect = gpu_collect
-        self.eval_kwargs = eval_kwargs
-        self.save_best = save_best
-        self.key_indicator = key_indicator
-        self.rule = rule
-
-        self.logger = get_root_logger()
-
-        if self.save_best:
-            self.compare_func = self.rule_map[self.rule]
-            self.best_score = self.init_value_map[self.rule]
-
-        self.best_json = dict()
-
-    def after_train_epoch(self, runner):
-        """Called after every training epoch to evaluate the results."""
-        if not self.every_n_epochs(runner, self.interval):
-            return
-
-        current_ckpt_path = osp.join(runner.work_dir,
-                                     f'epoch_{runner.epoch + 1}.pth')
-        json_path = osp.join(runner.work_dir, 'best.json')
-
-        if osp.exists(json_path) and len(self.best_json) == 0:
-            self.best_json = mmcv.load(json_path)
-            self.best_score = self.best_json['best_score']
-            self.best_ckpt = self.best_json['best_ckpt']
-            self.key_indicator = self.best_json['key_indicator']
-
-        from mmaction.apis import single_gpu_test
-        results = single_gpu_test(runner.model, self.dataloader)
-        key_score = self.evaluate(runner, results)
-        if (self.save_best and self.compare_func(key_score, self.best_score)):
-            self.best_score = key_score
-            self.logger.info(
-                f'Now best checkpoint is epoch_{runner.epoch + 1}.pth')
-            self.best_json['best_score'] = self.best_score
-            self.best_json['best_ckpt'] = current_ckpt_path
-            self.best_json['key_indicator'] = self.key_indicator
-            mmcv.dump(self.best_json, json_path)
-
-    def evaluate(self, runner, results):
-        """Evaluate the results.
+try:
+    from mmcv.runner import EvalHook as BasicEvalHook
+    from mmcv.runner import DistEvalHook as BasicDistEvalHook
+
+    from_mmcv = True
+
+    class EvalHook(BasicEvalHook):
+        greater_keys = [
+            'acc', 'top', 'AR@', 'auc', 'precision', 'mAP@', 'Recall@'
+        ]
+        less_keys = ['loss']
+
+        def __init__(self, *args, save_best='auto', **kwargs):
+            super().__init__(*args, save_best=save_best, **kwargs)
+
+    class DistEvalHook(BasicDistEvalHook):
+        greater_keys = [
+            'acc', 'top', 'AR@', 'auc', 'precision', 'mAP@', 'Recall@'
+        ]
+        less_keys = ['loss']
+
+        def __init__(self, *args, save_best='auto', **kwargs):
+            super().__init__(*args, save_best=save_best, **kwargs)
+
+except (ImportError, ModuleNotFoundError):
+    warnings.warn('DeprecationWarning: EvalHook and DistEvalHook in mmaction2 '
+                  'will be deprecated, please install mmcv through master '
+                  'branch.')
+    from_mmcv = False
+
+if not from_mmcv:
+
+    from mmcv.runner import Hook
+
+    class EvalHook(Hook):  # noqa: F811
+        """Non-Distributed evaluation hook.
+
+        Notes:
+            If new arguments are added for EvalHook, tools/test.py,
+            tools/eval_metric.py may be effected.
+
+        This hook will regularly perform evaluation in a given interval when
+        performing in non-distributed environment.
+
+        Args:
+            dataloader (DataLoader): A PyTorch dataloader.
+            start (int | None, optional): Evaluation starting epoch. It enables
+                evaluation before the training starts if ``start`` <= the
+                resuming epoch. If None, whether to evaluate is merely decided
+                by ``interval``. Default: None.
+            interval (int): Evaluation interval. Default: 1.
+            by_epoch (bool): Determine perform evaluation by epoch or by
+                iteration. If set to True, it will perform by epoch.
+                Otherwise, by iteration. default: True.
+            save_best (str | None, optional): If a metric is specified, it
+                would measure the best checkpoint during evaluation. The
+                information about best checkpoint would be save in best.json.
+                Options are the evaluation metrics to the test dataset. e.g.,
+                ``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
+                ``mean_average_precision``, ``mmit_mean_average_precision``
+                for action recognition dataset (RawframeDataset and
+                VideoDataset). ``AR@AN``, ``auc`` for action localization
+                dataset. (ActivityNetDataset). ``mAP@0.5IOU`` for
+                spatio-temporal action detection dataset (AVADataset).
+                If ``save_best`` is ``auto``, the first key of the returned
+                ``OrderedDict`` result will be used. Default: 'auto'.
+            rule (str | None, optional): Comparison rule for best score.
+                If set to None, it will infer a reasonable rule. Keys such as
+                'acc', 'top' .etc will be inferred by 'greater' rule. Keys
+                contain 'loss' will be inferred by 'less' rule. Options are
+                'greater', 'less', None. Default: None.
+            **eval_kwargs: Evaluation arguments fed into the evaluate function
+                of the dataset.
""" - eval_res = self.dataloader.dataset.evaluate( - results, logger=runner.logger, **self.eval_kwargs) - for name, val in eval_res.items(): - runner.log_buffer.output[name] = val - runner.log_buffer.ready = True - if self.key_indicator is not None: - return eval_res[self.key_indicator] - else: - return None + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP@', 'Recall@' + ] + less_keys = ['loss'] + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best='auto', + rule=None, + **eval_kwargs): + + if 'key_indicator' in eval_kwargs: + raise RuntimeError( + '"key_indicator" is deprecated, ' + 'you need to use "save_best" instead. ' + 'See https://github.com/open-mmlab/mmaction2/pull/395 ' + 'for more info') + + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + if interval <= 0: + raise ValueError( + f'interval must be positive, but got {interval}') + + assert isinstance(by_epoch, bool) + + if start is not None and start < 0: + warnings.warn( + f'The evaluation start epoch {start} is smaller than 0, ' + f'use 0 instead', UserWarning) + start = 0 + self.dataloader = dataloader + self.interval = interval + self.start = start + self.by_epoch = by_epoch + + assert isinstance(save_best, str) or save_best is None + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_flag = True + + if self.save_best is not None: + self.best_ckpt_path = None + self._init_rule(rule, self.save_best) + + def _init_rule(self, rule, key_indicator): + """Initialize rule, key_indicator, comparison_func, and best score. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. + """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + if any(key in key_indicator for key in self.greater_keys): + rule = 'greater' + elif any(key in key_indicator for key in self.less_keys): + rule = 'less' + else: + raise ValueError( + f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating a empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by + iteration.""" + if self.by_epoch: + return + if not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not self.by_epoch: + return + if not self.initial_flag: + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False -class DistEvalHook(EvalHook): - """Distributed evaluation hook. - - This hook will regularly perform evaluation in a given interval when - performing in distributed environment. 
-
-    Args:
-        dataloader (DataLoader): A PyTorch dataloader.
-        interval (int): Evaluation interval (by epochs). Default: 1.
-        gpu_collect (bool): Whether to use gpu or cpu to collect results.
-            Default: False.
-        save_best (bool): Whether to save best checkpoint during evaluation.
-            Default: True.
-        key_indicator (str | None): Key indicator to measure the best
-            checkpoint during evaluation when ``save_best`` is set to True.
-            Options are the evaluation metrics to the test dataset. e.g.,
-            ``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
-            ``mean_average_precision`` for action recognition dataset
-            (RawframeDataset and VideoDataset). ``AR@AN``, ``auc`` for action
-            localization dataset (ActivityNetDataset). Default: `top1_acc`.
-        rule (str | None): Comparison rule for best score. If set to None,
-            it will infer a reasonable rule. Default: 'None'.
-        eval_kwargs (dict, optional): Arguments for evaluation.
-    """
-
-    def after_train_epoch(self, runner):
-        """Called after each training epoch to evaluate the model."""
-        if not self.every_n_epochs(runner, self.interval):
-            return
-
-        current_ckpt_path = osp.join(runner.work_dir,
-                                     f'epoch_{runner.epoch + 1}.pth')
-        json_path = osp.join(runner.work_dir, 'best.json')
-
-        if osp.exists(json_path) and len(self.best_json) == 0:
-            self.best_json = mmcv.load(json_path)
-            self.best_score = self.best_json['best_score']
-            self.best_ckpt = self.best_json['best_ckpt']
-            self.key_indicator = self.best_json['key_indicator']
-
-        from mmaction.apis import multi_gpu_test
-        results = multi_gpu_test(
-            runner.model,
-            self.dataloader,
-            tmpdir=osp.join(runner.work_dir, '.eval_hook'),
-            gpu_collect=self.gpu_collect)
-        if runner.rank == 0:
-            print('\n')
+        def after_train_iter(self, runner):
+            """Called after every training iter to evaluate the results."""
+            if not self.by_epoch:
+                self._do_evaluate(runner)
+
+        def after_train_epoch(self, runner):
+            """Called after every training epoch to evaluate the results."""
+            if self.by_epoch:
+                self._do_evaluate(runner)
+
+        def _do_evaluate(self, runner):
+            """perform evaluation and save ckpt."""
+            if not self.evaluation_flag(runner):
+                return
+
+            from mmaction.apis import single_gpu_test
+            results = single_gpu_test(runner.model, self.dataloader)
             key_score = self.evaluate(runner, results)
-            if (self.save_best
-                    and self.compare_func(key_score, self.best_score)):
-                self.best_score = key_score
-                self.logger.info(
-                    f'Now best checkpoint is epoch_{runner.epoch + 1}.pth')
-                self.best_json['best_score'] = self.best_score
-                self.best_json['best_ckpt'] = current_ckpt_path
-                self.best_json['key_indicator'] = self.key_indicator
-                mmcv.dump(self.best_json, json_path)
+            if self.save_best:
+                self._save_ckpt(runner, key_score)
+
+        def evaluation_flag(self, runner):
+            """Judge whether to perform_evaluation.
+
+            Returns:
+                bool: The flag indicating whether to perform evaluation.
+            """
+            if self.by_epoch:
+                current = runner.epoch
+                check_time = self.every_n_epochs
+            else:
+                current = runner.iter
+                check_time = self.every_n_iters
+
+            if self.start is None:
+                if not check_time(runner, self.interval):
+                    # No evaluation during the interval.
+                    return False
+            elif (current + 1) < self.start:
+                # No evaluation if start is larger than the current time.
+                return False
+            else:
+                # Evaluation only at epochs/iters 3, 5, 7...
+                # if start==3 and interval==2
+                if (current + 1 - self.start) % self.interval:
+                    return False
+            return True
+
+        def _save_ckpt(self, runner, key_score):
+            if self.by_epoch:
+                current = f'epoch_{runner.epoch + 1}'
+                cur_type, cur_time = 'epoch', runner.epoch + 1
+            else:
+                current = f'iter_{runner.iter + 1}'
+                cur_type, cur_time = 'iter', runner.iter + 1
+
+            best_score = runner.meta['hook_msgs'].get(
+                'best_score', self.init_value_map[self.rule])
+            if self.compare_func(key_score, best_score):
+                best_score = key_score
+                runner.meta['hook_msgs']['best_score'] = best_score
+
+                if self.best_ckpt_path and osp.isfile(self.best_ckpt_path):
+                    os.remove(self.best_ckpt_path)
+
+                best_ckpt_name = f'best_{self.key_indicator}_{current}.pth'
+                runner.save_checkpoint(
+                    runner.work_dir, best_ckpt_name, create_symlink=False)
+                self.best_ckpt_path = osp.join(runner.work_dir, best_ckpt_name)
+
+                runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path
+                runner.logger.info(
+                    f'Now best checkpoint is saved as {best_ckpt_name}.')
+                runner.logger.info(
+                    f'Best {self.key_indicator} is {best_score:0.4f} '
+                    f'at {cur_time} {cur_type}.')
+
+        def evaluate(self, runner, results):
+            """Evaluate the results.
+
+            Args:
+                runner (:obj:`mmcv.Runner`): The underlined training runner.
+                results (list): Output results.
+            """
+            eval_res = self.dataloader.dataset.evaluate(
+                results, logger=runner.logger, **self.eval_kwargs)
+            for name, val in eval_res.items():
+                runner.log_buffer.output[name] = val
+            runner.log_buffer.ready = True
+            if self.save_best is not None:
+                if self.key_indicator == 'auto':
+                    # infer from eval_results
+                    self._init_rule(self.rule, list(eval_res.keys())[0])
+                return eval_res[self.key_indicator]
+
+            return None
+
+    class DistEvalHook(EvalHook):  # noqa: F811
+        """Distributed evaluation hook.
+
+        This hook will regularly perform evaluation in a given interval when
+        performing in distributed environment.
+
+        Args:
+            dataloader (DataLoader): A PyTorch dataloader.
+            start (int | None, optional): Evaluation starting epoch. It enables
+                evaluation before the training starts if ``start`` <= the
+                resuming epoch. If None, whether to evaluate is merely decided
+                by ``interval``. Default: None.
+            interval (int): Evaluation interval. Default: 1.
+            by_epoch (bool): Determine perform evaluation by epoch or by
+                iteration. If set to True, it will perform by epoch. Otherwise,
+                by iteration. default: True.
+            save_best (str | None, optional): If a metric is specified, it
+                would measure the best checkpoint during evaluation. The
+                information about best checkpoint would be save in best.json.
+                Options are the evaluation metrics to the test dataset. e.g.,
+                ``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
+                ``mean_average_precision``, ``mmit_mean_average_precision``
+                for action recognition dataset (RawframeDataset and
+                VideoDataset). ``AR@AN``, ``auc`` for action localization
+                dataset (ActivityNetDataset). ``mAP@0.5IOU`` for
+                spatio-temporal action detection dataset (AVADataset).
+                If ``save_best`` is ``auto``, the first key of the returned
+                ``OrderedDict`` result will be used. Default: 'auto'.
+            rule (str | None, optional): Comparison rule for best score. If
+                set to None, it will infer a reasonable rule. Keys such as
+                'acc', 'top' .etc will be inferred by 'greater' rule. Keys
+                contain 'loss' will be inferred by 'less' rule. Options are
+                'greater', 'less', None. Default: None.
+            tmpdir (str | None): Temporary directory to save the results of all
+                processes. Default: None.
+            gpu_collect (bool): Whether to use gpu or cpu to collect results.
+                Default: False.
+            broadcast_bn_buffer (bool): Whether to broadcast the
+                buffer(running_mean and running_var) of rank 0 to other rank
+                before evaluation. Default: True.
+            **eval_kwargs: Evaluation arguments fed into the evaluate function
+                of the dataset.
+        """
+
+        def __init__(self,
+                     dataloader,
+                     start=None,
+                     interval=1,
+                     by_epoch=True,
+                     save_best='auto',
+                     rule=None,
+                     broadcast_bn_buffer=True,
+                     tmpdir=None,
+                     gpu_collect=False,
+                     **eval_kwargs):
+            super().__init__(
+                dataloader,
+                start=start,
+                interval=interval,
+                by_epoch=by_epoch,
+                save_best=save_best,
+                rule=rule,
+                **eval_kwargs)
+            self.broadcast_bn_buffer = broadcast_bn_buffer
+            self.tmpdir = tmpdir
+            self.gpu_collect = gpu_collect
+
+        def _do_evaluate(self, runner):
+            """perform evaluation and save ckpt."""
+            # Synchronization of BatchNorm's buffer (running_mean
+            # and running_var) is not supported in the DDP of pytorch,
+            # which may cause the inconsistent performance of models in
+            # different ranks, so we broadcast BatchNorm's buffers
+            # of rank 0 to other ranks to avoid this.
+            if self.broadcast_bn_buffer:
+                model = runner.model
+                for _, module in model.named_modules():
+                    if isinstance(module,
+                                  _BatchNorm) and module.track_running_stats:
+                        dist.broadcast(module.running_var, 0)
+                        dist.broadcast(module.running_mean, 0)
+
+            if not self.evaluation_flag(runner):
+                return
+
+            from mmaction.apis import multi_gpu_test
+            tmpdir = self.tmpdir
+            if tmpdir is None:
+                tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
+            results = multi_gpu_test(
+                runner.model,
+                self.dataloader,
+                tmpdir=tmpdir,
+                gpu_collect=self.gpu_collect)
+            if runner.rank == 0:
+                print('\n')
+                key_score = self.evaluate(runner, results)
+
+                if self.save_best:
+                    self._save_ckpt(runner, key_score)
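
Note (not part of the patches above): a minimal, self-contained sketch of how the
two dataloading tweaks from patches 1-3 behave in plain PyTorch. The dummy
dataset, batch size, and worker count below are made-up placeholders; in this
repository the flag is passed through build_dataloader instead, and
cv2.setNumThreads(0) is called once at the top of tools/train_semi.py.

    import cv2
    import torch
    from torch.utils.data import DataLoader, Dataset


    class DummyVideoDataset(Dataset):
        """Stand-in for mmaction2's VideoDataset, used only for illustration."""

        def __len__(self):
            return 8

        def __getitem__(self, idx):
            # With cv2.setNumThreads(0), any OpenCV decoding done here stays
            # single-threaded, so worker processes do not oversubscribe CPUs.
            return torch.zeros(3, 224, 224), 0


    if __name__ == '__main__':
        cv2.setNumThreads(0)  # same call that patch 3 adds to tools/train_semi.py
        num_workers = 4
        loader = DataLoader(
            DummyVideoDataset(),
            batch_size=2,
            num_workers=num_workers,
            # Keep workers alive between epochs (PyTorch >= 1.7), mirroring the
            # persistent_workers flag that patches 1-2 add to build_dataloader.
            persistent_workers=num_workers > 0)
        for imgs, labels in loader:
            print(imgs.shape, labels)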