From 2c3fd2b7cd379b42823cf0c59c8c82254baef7e5 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Sat, 29 Mar 2025 00:53:04 +0000 Subject: [PATCH 01/20] early stopping --- rfdetr/main.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/rfdetr/main.py b/rfdetr/main.py index 7c18a25..4025094 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -133,10 +133,19 @@ def __init__(self, **kwargs): self.model.backbone[0].encoder = get_peft_model(self.model.backbone[0].encoder, lora_config) self.model = self.model.to(self.device) self.criterion, self.postprocessors = build_criterion_and_postprocessors(args) + self.stop_early = False + + def request_early_stop(self): + self.stop_early = True + print("Early stopping requested, will complete current epoch and stop") def reinitialize_detection_head(self, num_classes): self.model.reinitialize_detection_head(num_classes) + def request_early_stop(self): + self.stop_early = True + print("Early stopping requested, will complete current epoch and stop") + def train(self, callbacks: DefaultDict[str, List[Callable]], **kwargs): currently_supported_callbacks = ["on_fit_epoch_end", "on_train_batch_start", "on_train_end"] for key in callbacks.keys(): @@ -398,6 +407,10 @@ def lr_lambda(current_step: int): for callback in callbacks["on_fit_epoch_end"]: callback(log_stats) + if self.stop_early: + print(f"Early stopping requested, stopping at epoch {epoch}") + break + best_is_ema = best_map_ema_5095 > best_map_5095 if best_is_ema: shutil.copy2(output_dir / 'checkpoint_best_ema.pth', output_dir / 'checkpoint_best_total.pth') From 5e466521cbbdfe6eda10c73b53f9a2b80fb4248a Mon Sep 17 00:00:00 2001 From: Matvezy Date: Sat, 29 Mar 2025 21:38:25 +0000 Subject: [PATCH 02/20] mp upd --- rfdetr/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rfdetr/main.py b/rfdetr/main.py index 4025094..a62d8bf 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -45,6 +45,8 @@ import shutil from rfdetr.util.files import download_file import os +import torch.multiprocessing +torch.multiprocessing.set_sharing_strategy('file_system') logger = getLogger(__name__) From 84ad2ac7a461a036dd33fff860aeeaf74c53ea92 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 16:57:31 +0000 Subject: [PATCH 03/20] 100 eps --- rfdetr/main.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rfdetr/main.py b/rfdetr/main.py index a62d8bf..0a32f5a 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -136,10 +136,6 @@ def __init__(self, **kwargs): self.model = self.model.to(self.device) self.criterion, self.postprocessors = build_criterion_and_postprocessors(args) self.stop_early = False - - def request_early_stop(self): - self.stop_early = True - print("Early stopping requested, will complete current epoch and stop") def reinitialize_detection_head(self, num_classes): self.model.reinitialize_detection_head(num_classes) From b15c33a9a89475de18b6576b18a26cbb2f997f30 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 17:03:21 +0000 Subject: [PATCH 04/20] early stopping --- rfdetr/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rfdetr/main.py b/rfdetr/main.py index 0a32f5a..4f61f65 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -45,8 +45,9 @@ import shutil from rfdetr.util.files import download_file import os -import torch.multiprocessing -torch.multiprocessing.set_sharing_strategy('file_system') +if os.environ.get("USE_FILE_SYSTEM_SHARING", "0") == "1": + import torch.multiprocessing + torch.multiprocessing.set_sharing_strategy('file_system') logger = getLogger(__name__) From 4503135ab2d181ded9a691b6d11f63fc28185154 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 19:37:10 +0000 Subject: [PATCH 05/20] eraly stopping callback --- rfdetr/config.py | 4 +++ rfdetr/main.py | 29 +++++++++++++++++++ rfdetr/util/early_stopping.py | 54 +++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 rfdetr/util/early_stopping.py diff --git a/rfdetr/config.py b/rfdetr/config.py index fcea48c..86079b0 100644 --- a/rfdetr/config.py +++ b/rfdetr/config.py @@ -70,3 +70,7 @@ class TrainConfig(BaseModel): use_ema: bool = True num_workers: int = 2 weight_decay: float = 1e-4 + early_stopping: bool = False + early_stopping_patience: int = 5 + early_stopping_min_delta: float = 0.001 + early_stopping_use_ema: bool = False diff --git a/rfdetr/main.py b/rfdetr/main.py index 4f61f65..894b6e2 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -159,6 +159,17 @@ def train(self, callbacks: DefaultDict[str, List[Callable]], **kwargs): print(args) device = torch.device(args.device) + # Initialize early stopping if enabled + if args.early_stopping: + from rfdetr.util.early_stopping import EarlyStoppingCallback + early_stopping_callback = EarlyStoppingCallback( + patience=args.early_stopping_patience, + min_delta=args.early_stopping_min_delta, + use_ema=args.early_stopping_use_ema + ) + early_stopping_callback.set_model(self) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) @@ -752,6 +763,15 @@ def get_args_parser(): ) parser.add_argument('--lr_min_factor', default=0.0, type=float, help='Minimum learning rate factor (as a fraction of initial lr) at the end of cosine annealing') + # Early stopping parameters + parser.add_argument('--early_stopping', action='store_true', + help='Enable early stopping based on mAP improvement') + parser.add_argument('--early_stopping_patience', default=5, type=int, + help='Number of epochs with no improvement after which training will be stopped') + parser.add_argument('--early_stopping_min_delta', default=0.001, type=float, + help='Minimum change in mAP to qualify as an improvement') + parser.add_argument('--early_stopping_use_ema', action='store_true', + help='Use EMA model metrics for early stopping') # subparsers subparsers = parser.add_subparsers(title='sub-commands', dest='subcommand', description='valid subcommands', help='additional help') @@ -882,6 +902,11 @@ def populate_args( warmup_epochs=1, lr_scheduler='step', lr_min_factor=0.0, + # Early stopping parameters + early_stopping=False, + early_stopping_patience=5, + early_stopping_min_delta=0.001, + early_stopping_use_ema=False, # Additional subcommand=None, **extra_kwargs # To handle any unexpected arguments @@ -976,6 +1001,10 @@ def populate_args( warmup_epochs=warmup_epochs, lr_scheduler=lr_scheduler, lr_min_factor=lr_min_factor, + early_stopping=early_stopping, + early_stopping_patience=early_stopping_patience, + early_stopping_min_delta=early_stopping_min_delta, + early_stopping_use_ema=early_stopping_use_ema, **extra_kwargs ) return args \ No newline at end of file diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py new file mode 100644 index 0000000..19ce3af --- /dev/null +++ b/rfdetr/util/early_stopping.py @@ -0,0 +1,54 @@ +""" +Early stopping callback for RF-DETR training +""" + +class EarlyStoppingCallback: + """ + Early stopping callback that monitors mAP and stops training if no improvement + over a threshold is observed for a specified number of epochs. + + Args: + patience (int): Number of epochs with no improvement to wait before stopping + min_delta (float): Minimum change in mAP to qualify as improvement + use_ema (bool): Whether to use EMA model metrics for early stopping + verbose (bool): Whether to print early stopping messages + """ + + def __init__(self, patience=5, min_delta=0.001, use_ema=False, verbose=True): + self.patience = patience + self.min_delta = min_delta + self.use_ema = use_ema + self.verbose = verbose + self.best_map = 0.0 + self.counter = 0 + self.stop_training = False + self.model = None + + def update(self, log_stats): + """Update early stopping state based on epoch validation metrics""" + if self.use_ema and 'ema_test_coco_eval_bbox' in log_stats: + current_map = log_stats['ema_test_coco_eval_bbox'][0] + elif 'test_coco_eval_bbox' in log_stats: + current_map = log_stats['test_coco_eval_bbox'][0] + else: + return + + if current_map > self.best_map + self.min_delta: + self.best_map = current_map + self.counter = 0 + if self.verbose: + print(f"Early stopping: mAP improved to {current_map:.4f}") + else: + self.counter += 1 + if self.verbose: + print(f"Early stopping: No improvement in mAP for {self.counter} epochs (best: {self.best_map:.4f}, current: {current_map:.4f})") + + if self.counter >= self.patience: + self.stop_training = True + print(f"Early stopping triggered: No improvement above {self.min_delta} threshold for {self.patience} epochs") + if self.model: + self.model.request_early_stop() + + def set_model(self, model): + """Set the model reference to call request_early_stop when needed""" + self.model = model \ No newline at end of file From c8c576a1040592a087f0b957ea09c2c81c40cdd2 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 20:38:10 +0000 Subject: [PATCH 06/20] eraly stopping callback --- rfdetr/detr.py | 10 ++++++++++ rfdetr/test_output/log.txt | 35 +++++++++++++++++++++++++++++++++++ rfdetr/util/early_stopping.py | 21 +++++++++++---------- 3 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 rfdetr/test_output/log.txt diff --git a/rfdetr/detr.py b/rfdetr/detr.py index 10c2e28..a38bc4c 100644 --- a/rfdetr/detr.py +++ b/rfdetr/detr.py @@ -71,6 +71,16 @@ def train_from_config(self, config: TrainConfig, **kwargs): self.callbacks["on_fit_epoch_end"].append(metrics_tensor_board_sink.update) self.callbacks["on_train_end"].append(metrics_tensor_board_sink.close) + if config.early_stopping: + from rfdetr.util.early_stopping import EarlyStoppingCallback + early_stopping_callback = EarlyStoppingCallback( + model=self.model, + patience=config.early_stopping_patience, + min_delta=config.early_stopping_min_delta, + use_ema=config.early_stopping_use_ema + ) + self.callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + self.model.train( **all_kwargs, callbacks=self.callbacks, diff --git a/rfdetr/test_output/log.txt b/rfdetr/test_output/log.txt new file mode 100644 index 0000000..0731320 --- /dev/null +++ b/rfdetr/test_output/log.txt @@ -0,0 +1,35 @@ +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.36, 0.288, 0.0, 0.0, 0.0, 0.0, 0.324], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.38, 0.30400000000000005, 0.0, 0.0, 0.0, 0.0, 0.342], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} +{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.42, 0.336, 0.0, 0.0, 0.0, 0.0, 0.378], 'n_parameters': 1000000} +{'epoch': 7, 'train_loss': 0.125, 'train_class_error': 0.0625, 'test_loss': 0.15, 'test_coco_eval_bbox': [0.44, 0.35200000000000004, 0.0, 0.0, 0.0, 0.0, 0.396], 'n_parameters': 1000000} +{'epoch': 8, 'train_loss': 0.1111111111111111, 'train_class_error': 0.05555555555555555, 'test_loss': 0.13333333333333333, 'test_coco_eval_bbox': [0.46, 0.36800000000000005, 0.0, 0.0, 0.0, 0.0, 0.41400000000000003], 'n_parameters': 1000000} +{'epoch': 9, 'train_loss': 0.1, 'train_class_error': 0.05, 'test_loss': 0.12, 'test_coco_eval_bbox': [0.48, 0.384, 0.0, 0.0, 0.0, 0.0, 0.432], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.35, 0.27999999999999997, 0.0, 0.0, 0.0, 0.0, 0.315], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.45, 0.36000000000000004, 0.0, 0.0, 0.0, 0.0, 0.405], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.451, 0.3608, 0.0, 0.0, 0.0, 0.0, 0.40590000000000004], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} +{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.33, 0.264, 0.0, 0.0, 0.0, 0.0, 0.29700000000000004], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.31, 0.248, 0.0, 0.0, 0.0, 0.0, 0.279], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.315, 0.252, 0.0, 0.0, 0.0, 0.0, 0.28350000000000003]} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.336, 0.26880000000000004, 0.0, 0.0, 0.0, 0.0, 0.3024]} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35700000000000004, 0.2856, 0.0, 0.0, 0.0, 0.0, 0.32130000000000003]} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35805000000000003, 0.28644000000000003, 0.0, 0.0, 0.0, 0.0, 0.32224500000000006]} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py index 19ce3af..413dc58 100644 --- a/rfdetr/util/early_stopping.py +++ b/rfdetr/util/early_stopping.py @@ -14,41 +14,42 @@ class EarlyStoppingCallback: verbose (bool): Whether to print early stopping messages """ - def __init__(self, patience=5, min_delta=0.001, use_ema=False, verbose=True): + def __init__(self, model, patience=5, min_delta=0.001, use_ema=False, verbose=True): self.patience = patience self.min_delta = min_delta self.use_ema = use_ema self.verbose = verbose self.best_map = 0.0 self.counter = 0 - self.stop_training = False - self.model = None + self.model = model def update(self, log_stats): """Update early stopping state based on epoch validation metrics""" + # Get the mAP value from the log stats if self.use_ema and 'ema_test_coco_eval_bbox' in log_stats: current_map = log_stats['ema_test_coco_eval_bbox'][0] elif 'test_coco_eval_bbox' in log_stats: current_map = log_stats['test_coco_eval_bbox'][0] else: + # No valid mAP metric found, skip early stopping check return + # Check if current mAP is better than best so far (by at least min_delta) if current_map > self.best_map + self.min_delta: + # We have an improvement self.best_map = current_map self.counter = 0 if self.verbose: print(f"Early stopping: mAP improved to {current_map:.4f}") else: + # No improvement self.counter += 1 if self.verbose: print(f"Early stopping: No improvement in mAP for {self.counter} epochs (best: {self.best_map:.4f}, current: {current_map:.4f})") - + + # Check if early stopping criteria met if self.counter >= self.patience: - self.stop_training = True print(f"Early stopping triggered: No improvement above {self.min_delta} threshold for {self.patience} epochs") + # Request model to stop early if self.model: - self.model.request_early_stop() - - def set_model(self, model): - """Set the model reference to call request_early_stop when needed""" - self.model = model \ No newline at end of file + self.model.request_early_stop() \ No newline at end of file From 468c5251330a1b513c64f546e7de3cd1bd747a7f Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 22:18:21 +0000 Subject: [PATCH 07/20] trun early stopping on by default --- rfdetr/main.py | 2 +- rfdetr/test_early_stopping.py | 261 ++++++++++++++++++++++++++++++++++ rfdetr/test_output/log.txt | 35 +++++ 3 files changed, 297 insertions(+), 1 deletion(-) create mode 100644 rfdetr/test_early_stopping.py diff --git a/rfdetr/main.py b/rfdetr/main.py index 894b6e2..04527e9 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -903,7 +903,7 @@ def populate_args( lr_scheduler='step', lr_min_factor=0.0, # Early stopping parameters - early_stopping=False, + early_stopping=True, early_stopping_patience=5, early_stopping_min_delta=0.001, early_stopping_use_ema=False, diff --git a/rfdetr/test_early_stopping.py b/rfdetr/test_early_stopping.py new file mode 100644 index 0000000..31b995b --- /dev/null +++ b/rfdetr/test_early_stopping.py @@ -0,0 +1,261 @@ +import sys +import os +import time +import torch +import numpy as np +from pathlib import Path +from collections import defaultdict + +# Add the project root to path so we can import the code +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from rfdetr.main import Model, populate_args +from rfdetr.util.early_stopping import EarlyStoppingCallback + +class MockModel: + """Mock model that simulates the Model class but doesn't build a real model""" + + def __init__(self, map_values, **kwargs): + """ + Args: + map_values: List of mAP values to return for each epoch + **kwargs: Arguments to pass to populate_args + """ + self.map_values = map_values + self.args = populate_args(**kwargs) + self.stop_early = False + self.current_epoch = 0 + + def request_early_stop(self): + """Same method as Model.request_early_stop""" + self.stop_early = True + print("Early stopping requested, will complete current epoch and stop") + + def train(self, callbacks=None, **kwargs): + """Simulated train method that follows the same pattern as Model.train""" + if callbacks is None: + callbacks = defaultdict(list) + + # Set up the parameters + args = populate_args(**kwargs) + + # We need a valid output directory for logs + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + print("\n===== Testing Early Stopping with Mock Model =====") + print(f"Using map_values: {self.map_values}") + if hasattr(args, 'early_stopping') and args.early_stopping: + print(f"Early stopping params: patience={args.early_stopping_patience}, min_delta={args.early_stopping_min_delta}") + + print("\nStarting mock training...") + start_time = time.time() + + for epoch in range(min(args.epochs, len(self.map_values))): + self.current_epoch = epoch + + # Simulate one epoch of training + epoch_start_time = time.time() + time.sleep(0.2) # To make output more readable + + # Generate mock train stats + train_stats = { + 'loss': 1.0 / (epoch + 1), # Decreasing loss + 'class_error': 0.5 / (epoch + 1) + } + + # Generate mock evaluation stats with the pre-defined mAP + map_value = self.map_values[epoch] + test_stats = { + 'loss': 1.2 / (epoch + 1), + 'coco_eval_bbox': [map_value, map_value * 0.8, 0.0, 0.0, 0.0, 0.0, map_value * 0.9] + } + + # Create log stats dictionary similar to the real train method + log_stats = { + 'epoch': epoch, + **{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'n_parameters': 1000000 # Dummy value + } + + if args.use_ema: + # Add EMA metrics (slightly better than regular metrics) + ema_map = map_value * 1.05 + log_stats['ema_test_coco_eval_bbox'] = [ + ema_map, ema_map * 0.8, 0.0, 0.0, 0.0, 0.0, ema_map * 0.9 + ] + + print(f"Epoch {epoch}: mAP = {map_value:.4f}") + + # Write the log file similar to the real train method + if args.output_dir: + with (output_dir / "log.txt").open("a") as f: + f.write(f"{str(log_stats)}\n") + + # Call the on_fit_epoch_end callbacks + for callback in callbacks["on_fit_epoch_end"]: + callback(log_stats) + + # Check if early stopping was triggered + if self.stop_early: + print(f"\nāœ… Early stopping triggered after epoch {epoch}") + break + else: + print("\nāŒ Early stopping was not triggered") + + total_time = time.time() - start_time + print(f"Training completed in {total_time:.2f} seconds") + +# Test scenarios with different mAP patterns + +def test_scenario_1(): + """Steady improvement, no early stopping expected""" + map_values = [0.30, 0.32, 0.34, 0.36, 0.38, 0.40, 0.42, 0.44, 0.46, 0.48] + model = MockModel(map_values=map_values, num_classes=2) + + # Initialize callbacks - this simulates what happens in detr.py + callbacks = defaultdict(list) + + # Initialize early stopping callback - similar to how it would be done in detr.py + early_stopping_callback = EarlyStoppingCallback( + model=model, # Pass model directly now + patience=3, + min_delta=0.005, + use_ema=False + ) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + + model.train( + callbacks=callbacks, + epochs=10, + output_dir="test_output", + early_stopping=True, + early_stopping_patience=3, + early_stopping_min_delta=0.005 + ) + +def test_scenario_2(): + """Early plateau, should trigger early stopping""" + map_values = [0.30, 0.32, 0.34, 0.341, 0.342, 0.342, 0.343, 0.343, 0.344, 0.344] + model = MockModel(map_values=map_values, num_classes=2) + + # Initialize callbacks + callbacks = defaultdict(list) + + # Initialize early stopping callback + early_stopping_callback = EarlyStoppingCallback( + model=model, + patience=3, + min_delta=0.005, + use_ema=False + ) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + + model.train( + callbacks=callbacks, + epochs=10, + output_dir="test_output", + early_stopping=True, + early_stopping_patience=3, + early_stopping_min_delta=0.005 + ) + +def test_scenario_3(): + """Initial improvement then plateau""" + map_values = [0.30, 0.35, 0.40, 0.45, 0.451, 0.452, 0.452, 0.453, 0.453, 0.454] + model = MockModel(map_values=map_values, num_classes=2) + + # Initialize callbacks + callbacks = defaultdict(list) + + # Initialize early stopping callback + early_stopping_callback = EarlyStoppingCallback( + model=model, + patience=3, + min_delta=0.005, + use_ema=False + ) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + + model.train( + callbacks=callbacks, + epochs=10, + output_dir="test_output", + early_stopping=True, + early_stopping_patience=3, + early_stopping_min_delta=0.005 + ) + +def test_scenario_4(): + """Decreasing performance""" + map_values = [0.30, 0.32, 0.34, 0.33, 0.32, 0.31, 0.30, 0.29, 0.28, 0.27] + model = MockModel(map_values=map_values, num_classes=2) + + # Initialize callbacks + callbacks = defaultdict(list) + + # Initialize early stopping callback + early_stopping_callback = EarlyStoppingCallback( + model=model, + patience=3, + min_delta=0.005, + use_ema=False + ) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + + model.train( + callbacks=callbacks, + epochs=10, + output_dir="test_output", + early_stopping=True, + early_stopping_patience=3, + early_stopping_min_delta=0.005 + ) + +def test_scenario_5(): + """With EMA metrics""" + map_values = [0.30, 0.32, 0.34, 0.341, 0.342, 0.342, 0.343, 0.343, 0.344, 0.344] + model = MockModel(map_values=map_values, num_classes=2) + + # Initialize callbacks + callbacks = defaultdict(list) + + # Initialize early stopping callback with EMA + early_stopping_callback = EarlyStoppingCallback( + model=model, + patience=3, + min_delta=0.005, + use_ema=True + ) + callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) + + model.train( + callbacks=callbacks, + epochs=10, + output_dir="test_output", + use_ema=True, + early_stopping=True, + early_stopping_patience=3, + early_stopping_min_delta=0.005, + early_stopping_use_ema=True + ) + +if __name__ == "__main__": + # Make sure the output directory exists + os.makedirs("test_output", exist_ok=True) + + print("\n\nšŸ” SCENARIO 1: Steady improvement, no early stopping") + test_scenario_1() + + print("\n\nšŸ” SCENARIO 2: Early plateau, should trigger early stopping") + test_scenario_2() + + print("\n\nšŸ” SCENARIO 3: Initial improvement then plateau") + test_scenario_3() + + print("\n\nšŸ” SCENARIO 4: Decreasing performance") + test_scenario_4() + + print("\n\nšŸ” SCENARIO 5: Using EMA metrics") + test_scenario_5() \ No newline at end of file diff --git a/rfdetr/test_output/log.txt b/rfdetr/test_output/log.txt index 0731320..67e9ee3 100644 --- a/rfdetr/test_output/log.txt +++ b/rfdetr/test_output/log.txt @@ -33,3 +33,38 @@ {'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35805000000000003, 0.28644000000000003, 0.0, 0.0, 0.0, 0.0, 0.32224500000000006]} {'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} {'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.36, 0.288, 0.0, 0.0, 0.0, 0.0, 0.324], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.38, 0.30400000000000005, 0.0, 0.0, 0.0, 0.0, 0.342], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} +{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.42, 0.336, 0.0, 0.0, 0.0, 0.0, 0.378], 'n_parameters': 1000000} +{'epoch': 7, 'train_loss': 0.125, 'train_class_error': 0.0625, 'test_loss': 0.15, 'test_coco_eval_bbox': [0.44, 0.35200000000000004, 0.0, 0.0, 0.0, 0.0, 0.396], 'n_parameters': 1000000} +{'epoch': 8, 'train_loss': 0.1111111111111111, 'train_class_error': 0.05555555555555555, 'test_loss': 0.13333333333333333, 'test_coco_eval_bbox': [0.46, 0.36800000000000005, 0.0, 0.0, 0.0, 0.0, 0.41400000000000003], 'n_parameters': 1000000} +{'epoch': 9, 'train_loss': 0.1, 'train_class_error': 0.05, 'test_loss': 0.12, 'test_coco_eval_bbox': [0.48, 0.384, 0.0, 0.0, 0.0, 0.0, 0.432], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.35, 0.27999999999999997, 0.0, 0.0, 0.0, 0.0, 0.315], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.45, 0.36000000000000004, 0.0, 0.0, 0.0, 0.0, 0.405], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.451, 0.3608, 0.0, 0.0, 0.0, 0.0, 0.40590000000000004], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} +{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.33, 0.264, 0.0, 0.0, 0.0, 0.0, 0.29700000000000004], 'n_parameters': 1000000} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.31, 0.248, 0.0, 0.0, 0.0, 0.0, 0.279], 'n_parameters': 1000000} +{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.315, 0.252, 0.0, 0.0, 0.0, 0.0, 0.28350000000000003]} +{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.336, 0.26880000000000004, 0.0, 0.0, 0.0, 0.0, 0.3024]} +{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35700000000000004, 0.2856, 0.0, 0.0, 0.0, 0.0, 0.32130000000000003]} +{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35805000000000003, 0.28644000000000003, 0.0, 0.0, 0.0, 0.0, 0.32224500000000006]} +{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} +{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} From ecc0ee9c504934f96c3aca735576f092a9d84fe9 Mon Sep 17 00:00:00 2001 From: Matvei Popov <46304340+Matvezy@users.noreply.github.com> Date: Mon, 31 Mar 2025 15:19:11 -0700 Subject: [PATCH 08/20] Delete rfdetr/test_early_stopping.py --- rfdetr/test_early_stopping.py | 261 ---------------------------------- 1 file changed, 261 deletions(-) delete mode 100644 rfdetr/test_early_stopping.py diff --git a/rfdetr/test_early_stopping.py b/rfdetr/test_early_stopping.py deleted file mode 100644 index 31b995b..0000000 --- a/rfdetr/test_early_stopping.py +++ /dev/null @@ -1,261 +0,0 @@ -import sys -import os -import time -import torch -import numpy as np -from pathlib import Path -from collections import defaultdict - -# Add the project root to path so we can import the code -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from rfdetr.main import Model, populate_args -from rfdetr.util.early_stopping import EarlyStoppingCallback - -class MockModel: - """Mock model that simulates the Model class but doesn't build a real model""" - - def __init__(self, map_values, **kwargs): - """ - Args: - map_values: List of mAP values to return for each epoch - **kwargs: Arguments to pass to populate_args - """ - self.map_values = map_values - self.args = populate_args(**kwargs) - self.stop_early = False - self.current_epoch = 0 - - def request_early_stop(self): - """Same method as Model.request_early_stop""" - self.stop_early = True - print("Early stopping requested, will complete current epoch and stop") - - def train(self, callbacks=None, **kwargs): - """Simulated train method that follows the same pattern as Model.train""" - if callbacks is None: - callbacks = defaultdict(list) - - # Set up the parameters - args = populate_args(**kwargs) - - # We need a valid output directory for logs - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - print("\n===== Testing Early Stopping with Mock Model =====") - print(f"Using map_values: {self.map_values}") - if hasattr(args, 'early_stopping') and args.early_stopping: - print(f"Early stopping params: patience={args.early_stopping_patience}, min_delta={args.early_stopping_min_delta}") - - print("\nStarting mock training...") - start_time = time.time() - - for epoch in range(min(args.epochs, len(self.map_values))): - self.current_epoch = epoch - - # Simulate one epoch of training - epoch_start_time = time.time() - time.sleep(0.2) # To make output more readable - - # Generate mock train stats - train_stats = { - 'loss': 1.0 / (epoch + 1), # Decreasing loss - 'class_error': 0.5 / (epoch + 1) - } - - # Generate mock evaluation stats with the pre-defined mAP - map_value = self.map_values[epoch] - test_stats = { - 'loss': 1.2 / (epoch + 1), - 'coco_eval_bbox': [map_value, map_value * 0.8, 0.0, 0.0, 0.0, 0.0, map_value * 0.9] - } - - # Create log stats dictionary similar to the real train method - log_stats = { - 'epoch': epoch, - **{f'train_{k}': v for k, v in train_stats.items()}, - **{f'test_{k}': v for k, v in test_stats.items()}, - 'n_parameters': 1000000 # Dummy value - } - - if args.use_ema: - # Add EMA metrics (slightly better than regular metrics) - ema_map = map_value * 1.05 - log_stats['ema_test_coco_eval_bbox'] = [ - ema_map, ema_map * 0.8, 0.0, 0.0, 0.0, 0.0, ema_map * 0.9 - ] - - print(f"Epoch {epoch}: mAP = {map_value:.4f}") - - # Write the log file similar to the real train method - if args.output_dir: - with (output_dir / "log.txt").open("a") as f: - f.write(f"{str(log_stats)}\n") - - # Call the on_fit_epoch_end callbacks - for callback in callbacks["on_fit_epoch_end"]: - callback(log_stats) - - # Check if early stopping was triggered - if self.stop_early: - print(f"\nāœ… Early stopping triggered after epoch {epoch}") - break - else: - print("\nāŒ Early stopping was not triggered") - - total_time = time.time() - start_time - print(f"Training completed in {total_time:.2f} seconds") - -# Test scenarios with different mAP patterns - -def test_scenario_1(): - """Steady improvement, no early stopping expected""" - map_values = [0.30, 0.32, 0.34, 0.36, 0.38, 0.40, 0.42, 0.44, 0.46, 0.48] - model = MockModel(map_values=map_values, num_classes=2) - - # Initialize callbacks - this simulates what happens in detr.py - callbacks = defaultdict(list) - - # Initialize early stopping callback - similar to how it would be done in detr.py - early_stopping_callback = EarlyStoppingCallback( - model=model, # Pass model directly now - patience=3, - min_delta=0.005, - use_ema=False - ) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) - - model.train( - callbacks=callbacks, - epochs=10, - output_dir="test_output", - early_stopping=True, - early_stopping_patience=3, - early_stopping_min_delta=0.005 - ) - -def test_scenario_2(): - """Early plateau, should trigger early stopping""" - map_values = [0.30, 0.32, 0.34, 0.341, 0.342, 0.342, 0.343, 0.343, 0.344, 0.344] - model = MockModel(map_values=map_values, num_classes=2) - - # Initialize callbacks - callbacks = defaultdict(list) - - # Initialize early stopping callback - early_stopping_callback = EarlyStoppingCallback( - model=model, - patience=3, - min_delta=0.005, - use_ema=False - ) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) - - model.train( - callbacks=callbacks, - epochs=10, - output_dir="test_output", - early_stopping=True, - early_stopping_patience=3, - early_stopping_min_delta=0.005 - ) - -def test_scenario_3(): - """Initial improvement then plateau""" - map_values = [0.30, 0.35, 0.40, 0.45, 0.451, 0.452, 0.452, 0.453, 0.453, 0.454] - model = MockModel(map_values=map_values, num_classes=2) - - # Initialize callbacks - callbacks = defaultdict(list) - - # Initialize early stopping callback - early_stopping_callback = EarlyStoppingCallback( - model=model, - patience=3, - min_delta=0.005, - use_ema=False - ) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) - - model.train( - callbacks=callbacks, - epochs=10, - output_dir="test_output", - early_stopping=True, - early_stopping_patience=3, - early_stopping_min_delta=0.005 - ) - -def test_scenario_4(): - """Decreasing performance""" - map_values = [0.30, 0.32, 0.34, 0.33, 0.32, 0.31, 0.30, 0.29, 0.28, 0.27] - model = MockModel(map_values=map_values, num_classes=2) - - # Initialize callbacks - callbacks = defaultdict(list) - - # Initialize early stopping callback - early_stopping_callback = EarlyStoppingCallback( - model=model, - patience=3, - min_delta=0.005, - use_ema=False - ) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) - - model.train( - callbacks=callbacks, - epochs=10, - output_dir="test_output", - early_stopping=True, - early_stopping_patience=3, - early_stopping_min_delta=0.005 - ) - -def test_scenario_5(): - """With EMA metrics""" - map_values = [0.30, 0.32, 0.34, 0.341, 0.342, 0.342, 0.343, 0.343, 0.344, 0.344] - model = MockModel(map_values=map_values, num_classes=2) - - # Initialize callbacks - callbacks = defaultdict(list) - - # Initialize early stopping callback with EMA - early_stopping_callback = EarlyStoppingCallback( - model=model, - patience=3, - min_delta=0.005, - use_ema=True - ) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) - - model.train( - callbacks=callbacks, - epochs=10, - output_dir="test_output", - use_ema=True, - early_stopping=True, - early_stopping_patience=3, - early_stopping_min_delta=0.005, - early_stopping_use_ema=True - ) - -if __name__ == "__main__": - # Make sure the output directory exists - os.makedirs("test_output", exist_ok=True) - - print("\n\nšŸ” SCENARIO 1: Steady improvement, no early stopping") - test_scenario_1() - - print("\n\nšŸ” SCENARIO 2: Early plateau, should trigger early stopping") - test_scenario_2() - - print("\n\nšŸ” SCENARIO 3: Initial improvement then plateau") - test_scenario_3() - - print("\n\nšŸ” SCENARIO 4: Decreasing performance") - test_scenario_4() - - print("\n\nšŸ” SCENARIO 5: Using EMA metrics") - test_scenario_5() \ No newline at end of file From 37b08321de3cd3388952a941d0b4dd5a31a289cd Mon Sep 17 00:00:00 2001 From: Matvei Popov <46304340+Matvezy@users.noreply.github.com> Date: Mon, 31 Mar 2025 15:19:42 -0700 Subject: [PATCH 09/20] Delete rfdetr/test_output/log.txt --- rfdetr/test_output/log.txt | 70 -------------------------------------- 1 file changed, 70 deletions(-) delete mode 100644 rfdetr/test_output/log.txt diff --git a/rfdetr/test_output/log.txt b/rfdetr/test_output/log.txt deleted file mode 100644 index 67e9ee3..0000000 --- a/rfdetr/test_output/log.txt +++ /dev/null @@ -1,70 +0,0 @@ -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.36, 0.288, 0.0, 0.0, 0.0, 0.0, 0.324], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.38, 0.30400000000000005, 0.0, 0.0, 0.0, 0.0, 0.342], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} -{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.42, 0.336, 0.0, 0.0, 0.0, 0.0, 0.378], 'n_parameters': 1000000} -{'epoch': 7, 'train_loss': 0.125, 'train_class_error': 0.0625, 'test_loss': 0.15, 'test_coco_eval_bbox': [0.44, 0.35200000000000004, 0.0, 0.0, 0.0, 0.0, 0.396], 'n_parameters': 1000000} -{'epoch': 8, 'train_loss': 0.1111111111111111, 'train_class_error': 0.05555555555555555, 'test_loss': 0.13333333333333333, 'test_coco_eval_bbox': [0.46, 0.36800000000000005, 0.0, 0.0, 0.0, 0.0, 0.41400000000000003], 'n_parameters': 1000000} -{'epoch': 9, 'train_loss': 0.1, 'train_class_error': 0.05, 'test_loss': 0.12, 'test_coco_eval_bbox': [0.48, 0.384, 0.0, 0.0, 0.0, 0.0, 0.432], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.35, 0.27999999999999997, 0.0, 0.0, 0.0, 0.0, 0.315], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.45, 0.36000000000000004, 0.0, 0.0, 0.0, 0.0, 0.405], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.451, 0.3608, 0.0, 0.0, 0.0, 0.0, 0.40590000000000004], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} -{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.33, 0.264, 0.0, 0.0, 0.0, 0.0, 0.29700000000000004], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.31, 0.248, 0.0, 0.0, 0.0, 0.0, 0.279], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.315, 0.252, 0.0, 0.0, 0.0, 0.0, 0.28350000000000003]} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.336, 0.26880000000000004, 0.0, 0.0, 0.0, 0.0, 0.3024]} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35700000000000004, 0.2856, 0.0, 0.0, 0.0, 0.0, 0.32130000000000003]} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35805000000000003, 0.28644000000000003, 0.0, 0.0, 0.0, 0.0, 0.32224500000000006]} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.36, 0.288, 0.0, 0.0, 0.0, 0.0, 0.324], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.38, 0.30400000000000005, 0.0, 0.0, 0.0, 0.0, 0.342], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} -{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.42, 0.336, 0.0, 0.0, 0.0, 0.0, 0.378], 'n_parameters': 1000000} -{'epoch': 7, 'train_loss': 0.125, 'train_class_error': 0.0625, 'test_loss': 0.15, 'test_coco_eval_bbox': [0.44, 0.35200000000000004, 0.0, 0.0, 0.0, 0.0, 0.396], 'n_parameters': 1000000} -{'epoch': 8, 'train_loss': 0.1111111111111111, 'train_class_error': 0.05555555555555555, 'test_loss': 0.13333333333333333, 'test_coco_eval_bbox': [0.46, 0.36800000000000005, 0.0, 0.0, 0.0, 0.0, 0.41400000000000003], 'n_parameters': 1000000} -{'epoch': 9, 'train_loss': 0.1, 'train_class_error': 0.05, 'test_loss': 0.12, 'test_coco_eval_bbox': [0.48, 0.384, 0.0, 0.0, 0.0, 0.0, 0.432], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.35, 0.27999999999999997, 0.0, 0.0, 0.0, 0.0, 0.315], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.4, 0.32000000000000006, 0.0, 0.0, 0.0, 0.0, 0.36000000000000004], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.45, 0.36000000000000004, 0.0, 0.0, 0.0, 0.0, 0.405], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.451, 0.3608, 0.0, 0.0, 0.0, 0.0, 0.40590000000000004], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} -{'epoch': 6, 'train_loss': 0.14285714285714285, 'train_class_error': 0.07142857142857142, 'test_loss': 0.17142857142857143, 'test_coco_eval_bbox': [0.452, 0.36160000000000003, 0.0, 0.0, 0.0, 0.0, 0.4068], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.33, 0.264, 0.0, 0.0, 0.0, 0.0, 0.29700000000000004], 'n_parameters': 1000000} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.31, 0.248, 0.0, 0.0, 0.0, 0.0, 0.279], 'n_parameters': 1000000} -{'epoch': 0, 'train_loss': 1.0, 'train_class_error': 0.5, 'test_loss': 1.2, 'test_coco_eval_bbox': [0.3, 0.24, 0.0, 0.0, 0.0, 0.0, 0.27], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.315, 0.252, 0.0, 0.0, 0.0, 0.0, 0.28350000000000003]} -{'epoch': 1, 'train_loss': 0.5, 'train_class_error': 0.25, 'test_loss': 0.6, 'test_coco_eval_bbox': [0.32, 0.256, 0.0, 0.0, 0.0, 0.0, 0.28800000000000003], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.336, 0.26880000000000004, 0.0, 0.0, 0.0, 0.0, 0.3024]} -{'epoch': 2, 'train_loss': 0.3333333333333333, 'train_class_error': 0.16666666666666666, 'test_loss': 0.39999999999999997, 'test_coco_eval_bbox': [0.34, 0.272, 0.0, 0.0, 0.0, 0.0, 0.30600000000000005], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35700000000000004, 0.2856, 0.0, 0.0, 0.0, 0.0, 0.32130000000000003]} -{'epoch': 3, 'train_loss': 0.25, 'train_class_error': 0.125, 'test_loss': 0.3, 'test_coco_eval_bbox': [0.341, 0.27280000000000004, 0.0, 0.0, 0.0, 0.0, 0.3069], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35805000000000003, 0.28644000000000003, 0.0, 0.0, 0.0, 0.0, 0.32224500000000006]} -{'epoch': 4, 'train_loss': 0.2, 'train_class_error': 0.1, 'test_loss': 0.24, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} -{'epoch': 5, 'train_loss': 0.16666666666666666, 'train_class_error': 0.08333333333333333, 'test_loss': 0.19999999999999998, 'test_coco_eval_bbox': [0.342, 0.2736, 0.0, 0.0, 0.0, 0.0, 0.3078], 'n_parameters': 1000000, 'ema_test_coco_eval_bbox': [0.35910000000000003, 0.28728000000000004, 0.0, 0.0, 0.0, 0.0, 0.32319000000000003]} From 52bf804e1300a57ceb60b81007acc0741bbb9a91 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 22:29:40 +0000 Subject: [PATCH 10/20] fix callback saving --- rfdetr/util/misc.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/rfdetr/util/misc.py b/rfdetr/util/misc.py index 9169317..2587fcf 100644 --- a/rfdetr/util/misc.py +++ b/rfdetr/util/misc.py @@ -422,25 +422,11 @@ def save_on_master(obj, f, *args, **kwargs): Safely save objects, removing any callbacks that can't be pickled """ if is_main_process(): - try: - if isinstance(obj, dict): - obj_copy = {} - for k, v in obj.items(): - if k == 'args' and hasattr(v, '__dict__'): - args_dict = copy.copy(v.__dict__) - if 'callbacks' in args_dict: - del args_dict['callbacks'] - obj_copy[k] = argparse.Namespace(**args_dict) - elif k != 'callbacks': - obj_copy[k] = v - obj = obj_copy - - torch.save(obj, f, *args, **kwargs) - except Exception as e: - print(f"Error in safe_save_on_master: {e}") - if isinstance(obj, dict) and 'model' in obj: - print("Falling back to saving only model state_dict") - torch.save({'model': obj['model']}, f, *args, **kwargs) + if isinstance(obj, dict) and 'model' in obj: + print("Falling back to saving only model state_dict") + torch.save({'model': obj['model']}, f, *args, **kwargs) + else: + raise ValueError("Invalid object type for saving") def init_distributed_mode(args): From c4c4197abcca81c07fd6ad2f5a22b3a7c13b505b Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 23:03:54 +0000 Subject: [PATCH 11/20] fix bug --- rfdetr/config.py | 2 +- rfdetr/main.py | 29 +++++++++-------------------- rfdetr/util/early_stopping.py | 2 ++ 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/rfdetr/config.py b/rfdetr/config.py index 86079b0..ec06c7d 100644 --- a/rfdetr/config.py +++ b/rfdetr/config.py @@ -70,7 +70,7 @@ class TrainConfig(BaseModel): use_ema: bool = True num_workers: int = 2 weight_decay: float = 1e-4 - early_stopping: bool = False + early_stopping: bool = True early_stopping_patience: int = 5 early_stopping_min_delta: float = 0.001 early_stopping_use_ema: bool = False diff --git a/rfdetr/main.py b/rfdetr/main.py index 04527e9..457f4d1 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -29,21 +29,21 @@ import torch from torch.utils.data import DataLoader, DistributedSampler -from rfdetr.datasets import build_dataset, get_coco_api_from_dataset -from rfdetr.engine import evaluate, train_one_epoch -from rfdetr.models import build_model, build_criterion_and_postprocessors -from rfdetr.util.drop_scheduler import drop_scheduler -from rfdetr.util.get_param_dicts import get_param_dict -import rfdetr.util.misc as utils -from rfdetr.util.utils import ModelEma, BestMetricHolder, clean_state_dict -from rfdetr.util.benchmark import benchmark +from datasets import build_dataset, get_coco_api_from_dataset +from engine import evaluate, train_one_epoch +from models import build_model, build_criterion_and_postprocessors +from util.drop_scheduler import drop_scheduler +from util.get_param_dicts import get_param_dict +import util.misc as utils +from util.utils import ModelEma, BestMetricHolder, clean_state_dict +from util.benchmark import benchmark from torch import nn import torch.nn.functional as F from peft import LoraConfig, get_peft_model from typing import DefaultDict, List, Callable from logging import getLogger import shutil -from rfdetr.util.files import download_file +from util.files import download_file import os if os.environ.get("USE_FILE_SYSTEM_SHARING", "0") == "1": import torch.multiprocessing @@ -158,17 +158,6 @@ def train(self, callbacks: DefaultDict[str, List[Callable]], **kwargs): print("git:\n {}\n".format(utils.get_sha())) print(args) device = torch.device(args.device) - - # Initialize early stopping if enabled - if args.early_stopping: - from rfdetr.util.early_stopping import EarlyStoppingCallback - early_stopping_callback = EarlyStoppingCallback( - patience=args.early_stopping_patience, - min_delta=args.early_stopping_min_delta, - use_ema=args.early_stopping_use_ema - ) - early_stopping_callback.set_model(self) - callbacks["on_fit_epoch_end"].append(early_stopping_callback.update) # fix the seed for reproducibility seed = args.seed + utils.get_rank() diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py index 413dc58..aab1ca9 100644 --- a/rfdetr/util/early_stopping.py +++ b/rfdetr/util/early_stopping.py @@ -35,6 +35,8 @@ def update(self, log_stats): return # Check if current mAP is better than best so far (by at least min_delta) + print(f"DIFF: {current_map - self.best_map}") + print(f"MIN_DELTA: {self.min_delta}") if current_map > self.best_map + self.min_delta: # We have an improvement self.best_map = current_map From eca71c101161f03f52b6d78fc59e5ba49208375d Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 23:05:42 +0000 Subject: [PATCH 12/20] import fix --- rfdetr/main.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/rfdetr/main.py b/rfdetr/main.py index 457f4d1..3c02b0d 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -29,21 +29,21 @@ import torch from torch.utils.data import DataLoader, DistributedSampler -from datasets import build_dataset, get_coco_api_from_dataset -from engine import evaluate, train_one_epoch -from models import build_model, build_criterion_and_postprocessors -from util.drop_scheduler import drop_scheduler -from util.get_param_dicts import get_param_dict -import util.misc as utils -from util.utils import ModelEma, BestMetricHolder, clean_state_dict -from util.benchmark import benchmark +from rfdetr.datasets import build_dataset, get_coco_api_from_dataset +from rfdetr.engine import evaluate, train_one_epoch +from rfdetr.models import build_model, build_criterion_and_postprocessors +from rfdetr.util.drop_scheduler import drop_scheduler +from rfdetr.util.get_param_dicts import get_param_dict +import rfdetr.util.misc as utils +from rfdetr.util.utils import ModelEma, BestMetricHolder, clean_state_dict +from rfdetr.util.benchmark import benchmark from torch import nn import torch.nn.functional as F from peft import LoraConfig, get_peft_model from typing import DefaultDict, List, Callable from logging import getLogger import shutil -from util.files import download_file +from rfdetr.util.files import download_file import os if os.environ.get("USE_FILE_SYSTEM_SHARING", "0") == "1": import torch.multiprocessing From d25f5e19399f9780fea5232debb613478895c58e Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 23:33:58 +0000 Subject: [PATCH 13/20] updated based on max ema or regular --- rfdetr/util/early_stopping.py | 48 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py index aab1ca9..a6984ea 100644 --- a/rfdetr/util/early_stopping.py +++ b/rfdetr/util/early_stopping.py @@ -25,33 +25,49 @@ def __init__(self, model, patience=5, min_delta=0.001, use_ema=False, verbose=Tr def update(self, log_stats): """Update early stopping state based on epoch validation metrics""" - # Get the mAP value from the log stats - if self.use_ema and 'ema_test_coco_eval_bbox' in log_stats: - current_map = log_stats['ema_test_coco_eval_bbox'][0] - elif 'test_coco_eval_bbox' in log_stats: - current_map = log_stats['test_coco_eval_bbox'][0] + regular_map = None + ema_map = None + + if 'test_coco_eval_bbox' in log_stats: + regular_map = log_stats['test_coco_eval_bbox'][0] + + if 'ema_test_coco_eval_bbox' in log_stats: + ema_map = log_stats['ema_test_coco_eval_bbox'][0] + + current_map = None + if regular_map is not None and ema_map is not None: + if self.use_ema: + current_map = ema_map + metric_source = "EMA" + else: + current_map = max(regular_map, ema_map) + metric_source = "max(regular, EMA)" + elif ema_map is not None: + current_map = ema_map + metric_source = "EMA" + elif regular_map is not None: + current_map = regular_map + metric_source = "regular" else: - # No valid mAP metric found, skip early stopping check + if self.verbose: + print("Early stopping: No valid mAP metric found, skipping check") return - # Check if current mAP is better than best so far (by at least min_delta) - print(f"DIFF: {current_map - self.best_map}") - print(f"MIN_DELTA: {self.min_delta}") + if self.verbose: + print(f"Early stopping: Current mAP ({metric_source}): {current_map:.4f}, Best: {self.best_map:.4f}, Diff: {current_map - self.best_map:.4f}, Min delta: {self.min_delta}") + if current_map > self.best_map + self.min_delta: - # We have an improvement self.best_map = current_map self.counter = 0 if self.verbose: - print(f"Early stopping: mAP improved to {current_map:.4f}") + print(f"Early stopping: mAP improved to {current_map:.4f} using {metric_source} metric") else: - # No improvement self.counter += 1 if self.verbose: print(f"Early stopping: No improvement in mAP for {self.counter} epochs (best: {self.best_map:.4f}, current: {current_map:.4f})") - # Check if early stopping criteria met - if self.counter >= self.patience: - print(f"Early stopping triggered: No improvement above {self.min_delta} threshold for {self.patience} epochs") - # Request model to stop early + if self.counter >= self.patience: + print(f"Early stopping triggered: No improvement above {self.min_delta} threshold for {self.patience} epochs") + if self.model: if self.model: self.model.request_early_stop() \ No newline at end of file From 2050872912deaf66d9c28f1f87833c1bd6732e9d Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 23:35:34 +0000 Subject: [PATCH 14/20] updated based on max ema or regular --- rfdetr/util/early_stopping.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py index a6984ea..db7dc9f 100644 --- a/rfdetr/util/early_stopping.py +++ b/rfdetr/util/early_stopping.py @@ -69,5 +69,4 @@ def update(self, log_stats): if self.counter >= self.patience: print(f"Early stopping triggered: No improvement above {self.min_delta} threshold for {self.patience} epochs") if self.model: - if self.model: - self.model.request_early_stop() \ No newline at end of file + self.model.request_early_stop() \ No newline at end of file From 6b8e1f89aa44aeb3def7b7fa16b42f3007ef8a60 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Mon, 31 Mar 2025 23:59:45 +0000 Subject: [PATCH 15/20] filesystem --- rfdetr/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfdetr/main.py b/rfdetr/main.py index 3c02b0d..780cf75 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -45,7 +45,7 @@ import shutil from rfdetr.util.files import download_file import os -if os.environ.get("USE_FILE_SYSTEM_SHARING", "0") == "1": +if str(os.environ.get("USE_FILE_SYSTEM_SHARING", "False")).lower() in ["true", "1"]: import torch.multiprocessing torch.multiprocessing.set_sharing_strategy('file_system') From 49c2c5e735267f69f8e7743059882994111bc3f4 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Tue, 1 Apr 2025 00:13:02 +0000 Subject: [PATCH 16/20] default 10 steps --- rfdetr/config.py | 2 +- rfdetr/main.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rfdetr/config.py b/rfdetr/config.py index ec06c7d..3f973e1 100644 --- a/rfdetr/config.py +++ b/rfdetr/config.py @@ -71,6 +71,6 @@ class TrainConfig(BaseModel): num_workers: int = 2 weight_decay: float = 1e-4 early_stopping: bool = True - early_stopping_patience: int = 5 + early_stopping_patience: int = 10 early_stopping_min_delta: float = 0.001 early_stopping_use_ema: bool = False diff --git a/rfdetr/main.py b/rfdetr/main.py index 780cf75..52d954d 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -755,7 +755,7 @@ def get_args_parser(): # Early stopping parameters parser.add_argument('--early_stopping', action='store_true', help='Enable early stopping based on mAP improvement') - parser.add_argument('--early_stopping_patience', default=5, type=int, + parser.add_argument('--early_stopping_patience', default=10, type=int, help='Number of epochs with no improvement after which training will be stopped') parser.add_argument('--early_stopping_min_delta', default=0.001, type=float, help='Minimum change in mAP to qualify as an improvement') @@ -893,7 +893,7 @@ def populate_args( lr_min_factor=0.0, # Early stopping parameters early_stopping=True, - early_stopping_patience=5, + early_stopping_patience=10, early_stopping_min_delta=0.001, early_stopping_use_ema=False, # Additional From f8a5664a1593378db909e552ba9e0ecea10dc24b Mon Sep 17 00:00:00 2001 From: Matvezy Date: Tue, 1 Apr 2025 00:52:17 +0000 Subject: [PATCH 17/20] drop redundnant log --- rfdetr/util/misc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rfdetr/util/misc.py b/rfdetr/util/misc.py index 2587fcf..2b50d30 100644 --- a/rfdetr/util/misc.py +++ b/rfdetr/util/misc.py @@ -423,7 +423,6 @@ def save_on_master(obj, f, *args, **kwargs): """ if is_main_process(): if isinstance(obj, dict) and 'model' in obj: - print("Falling back to saving only model state_dict") torch.save({'model': obj['model']}, f, *args, **kwargs) else: raise ValueError("Invalid object type for saving") From b67bf6e24a9d5ef0766ae55b928af0b3ad48b014 Mon Sep 17 00:00:00 2001 From: Matvezy Date: Tue, 1 Apr 2025 21:03:17 +0000 Subject: [PATCH 18/20] pull changes --- rfdetr/util/early_stopping.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rfdetr/util/early_stopping.py b/rfdetr/util/early_stopping.py index db7dc9f..30bf888 100644 --- a/rfdetr/util/early_stopping.py +++ b/rfdetr/util/early_stopping.py @@ -2,6 +2,10 @@ Early stopping callback for RF-DETR training """ +from logging import getLogger + +logger = getLogger(__name__) + class EarlyStoppingCallback: """ Early stopping callback that monitors mAP and stops training if no improvement @@ -50,7 +54,7 @@ def update(self, log_stats): metric_source = "regular" else: if self.verbose: - print("Early stopping: No valid mAP metric found, skipping check") + raise ValueError("No valid mAP metric found!") return if self.verbose: @@ -59,8 +63,7 @@ def update(self, log_stats): if current_map > self.best_map + self.min_delta: self.best_map = current_map self.counter = 0 - if self.verbose: - print(f"Early stopping: mAP improved to {current_map:.4f} using {metric_source} metric") + logger.info(f"Early stopping: mAP improved to {current_map:.4f} using {metric_source} metric") else: self.counter += 1 if self.verbose: From 88de76391c73aafa78f11e19ac9edf3984e98b9d Mon Sep 17 00:00:00 2001 From: Matvezy Date: Tue, 1 Apr 2025 22:45:14 +0000 Subject: [PATCH 19/20] fix merging and model saving --- rfdetr/main.py | 1 + rfdetr/util/misc.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/rfdetr/main.py b/rfdetr/main.py index f4e24e8..8b683dd 100644 --- a/rfdetr/main.py +++ b/rfdetr/main.py @@ -892,6 +892,7 @@ def populate_args( early_stopping_patience=10, early_stopping_min_delta=0.001, early_stopping_use_ema=False, + gradient_checkpointing=False, # Additional subcommand=None, **extra_kwargs # To handle any unexpected arguments diff --git a/rfdetr/util/misc.py b/rfdetr/util/misc.py index 1a077bb..e73bbe0 100644 --- a/rfdetr/util/misc.py +++ b/rfdetr/util/misc.py @@ -425,11 +425,7 @@ def save_on_master(obj, f, *args, **kwargs): Safely save objects, removing any callbacks that can't be pickled """ if is_main_process(): - if isinstance(obj, dict) and 'model' in obj: - torch.save({'model': obj['model']}, f, *args, **kwargs) - else: - raise ValueError("Invalid object type for saving") - + torch.save(obj, f, *args, **kwargs) def init_distributed_mode(args): if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: From 575abe509845f84c884427f31faba413c8d1d95d Mon Sep 17 00:00:00 2001 From: Piotr Skalski Date: Wed, 2 Apr 2025 17:02:13 +0200 Subject: [PATCH 20/20] Update rfdetr/config.py --- rfdetr/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfdetr/config.py b/rfdetr/config.py index 5b9316f..35745b6 100644 --- a/rfdetr/config.py +++ b/rfdetr/config.py @@ -71,7 +71,7 @@ class TrainConfig(BaseModel): use_ema: bool = True num_workers: int = 2 weight_decay: float = 1e-4 - early_stopping: bool = True + early_stopping: bool = False early_stopping_patience: int = 10 early_stopping_min_delta: float = 0.001 early_stopping_use_ema: bool = False