diff --git a/official/vision/beta/projects/yolo/__init__.py b/official/vision/beta/projects/yolo/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/official/vision/beta/projects/yolo/common/registry_imports.py b/official/vision/beta/projects/yolo/common/registry_imports.py index e40d39856a7..56349093624 100644 --- a/official/vision/beta/projects/yolo/common/registry_imports.py +++ b/official/vision/beta/projects/yolo/common/registry_imports.py @@ -17,20 +17,16 @@ # pylint: disable=unused-import # pylint: disable=g-bad-import-order from official.common import registry_imports - # import configs from official.vision.beta.projects.yolo.configs import darknet_classification from official.vision.beta.projects.yolo.configs import yolo as yolo_config - # import modeling components from official.vision.beta.projects.yolo.modeling.backbones import darknet from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder - +# import optimization packages +from official.vision.beta.projects.yolo.optimization import optimizer_factory +from official.vision.beta.projects.yolo.optimization.configs import ( + optimization_config, optimizer_config) # import tasks from official.vision.beta.projects.yolo.tasks import image_classification from official.vision.beta.projects.yolo.tasks import yolo as yolo_task - -# import optimization packages -from official.vision.beta.projects.yolo.optimization import optimizer_factory -from official.vision.beta.projects.yolo.optimization.configs import optimizer_config -from official.vision.beta.projects.yolo.optimization.configs import optimization_config diff --git a/official/vision/beta/projects/yolo/configs/backbones.py b/official/vision/beta/projects/yolo/configs/backbones.py index 071af5bdef7..476e8e6ee4a 100644 --- a/official/vision/beta/projects/yolo/configs/backbones.py +++ b/official/vision/beta/projects/yolo/configs/backbones.py @@ -14,6 +14,7 @@ """Backbones configurations.""" import dataclasses + from official.modeling import hyperparams from official.vision.beta.configs import backbones diff --git a/official/vision/beta/projects/yolo/configs/decoders.py b/official/vision/beta/projects/yolo/configs/decoders.py index bc96e1b77a6..48a8c542b34 100755 --- a/official/vision/beta/projects/yolo/configs/decoders.py +++ b/official/vision/beta/projects/yolo/configs/decoders.py @@ -15,6 +15,7 @@ """Decoders configurations.""" import dataclasses from typing import Optional + from official.modeling import hyperparams from official.vision.beta.configs import decoders @@ -33,6 +34,8 @@ class YoloDecoder(hyperparams.Config): use_separable_conv: bool = False csp_stack: Optional[bool] = None fpn_depth: Optional[int] = None + max_fpn_depth: Optional[int] = None + max_csp_stack: Optional[int] = None fpn_filter_scale: Optional[int] = None path_process_len: Optional[int] = None max_level_process_len: Optional[int] = None diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_csp_640_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_csp_640_tpu.yaml new file mode 100644 index 00000000000..844c1f5b9d0 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_csp_640_tpu.yaml @@ -0,0 +1,80 @@ +# --experiment_type=scaled_yolo +# mAP 47.6 +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [640, 640, 3] + 
backbone: + type: 'darknet' + darknet: + model_id: 'altered_cspdarknet53' + max_level: 5 + min_level: 3 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: csp + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.3 + object_normalizer: + '5': 0.28 + '4': 0.70 + '3': 2.80 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 3 + boxes: [box: [12, 16], box: [19, 36], box: [40, 28], + box: [36, 75], box: [76, 55], box: [72, 146], + box: [142, 110], box: [192, 243], box: [459, 401]] + train_data: + input_path: 'gs://cam2-datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.2 + mosaic_crop_mode: 'scale' + mosaic_center: 0.25 + aug_scale_min: 0.1 + aug_scale_max: 1.9 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.1 + area_thresh: 0.1 + validation_data: + input_path: 'gs://cam2-datasets/coco/val*' +trainer: + train_steps: 831600 # epoch 300 to 450 + optimizer_config: + learning_rate: + cosine: + decay_steps: 831600 # epoch 300 to 450 \ No newline at end of file diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p5_896_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p5_896_tpu.yaml new file mode 100644 index 00000000000..a520acf4e91 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p5_896_tpu.yaml @@ -0,0 +1,82 @@ +# --experiment_type=large_yolo_finetune +# mAP 51.1% +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [896, 896, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'csp-large' + max_level: 5 + min_level: 3 + width_scale: 1.00 + depth_scale: 1.00 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: csp_large + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.5 + object_normalizer: + '5': 0.4 + '4': 1.0 + '3': 4.0 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 4 + boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45], + box: [48,102], box: [119,96], box: [97,189], box: [217,184], + box: [171,384], box: [324,451], box: [616,618], box: [800,800]] + train_data: + input_path: 'gs://cam2-datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.2 + mosaic_crop_mode: 'scale' + mosaic_center: 0.0 + aug_scale_min: 0.2 + aug_scale_max: 1.8 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.5 + area_thresh: 0.1 + validation_data: + input_path: 'gs://cam2-datasets/coco/val*' +trainer: + train_steps: 831600 # epoch 300 to 450 + optimizer_config: + 
    learning_rate:
+      cosine:
+        decay_steps: 831600 # epoch 300 to 450
\ No newline at end of file
diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p6_1280_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p6_1280_tpu.yaml
new file mode 100644
index 00000000000..b10bace12d7
--- /dev/null
+++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p6_1280_tpu.yaml
@@ -0,0 +1,84 @@
+# --experiment_type=large_yolo_finetune
+# mAP 54.4%
+runtime:
+  distribution_strategy: 'tpu'
+  mixed_precision_dtype: 'float32'
+  tpu_enable_xla_dynamic_padder: false
+task:
+  model:
+    input_size: [1280, 1280, 3]
+    backbone:
+      type: 'darknet'
+      darknet:
+        model_id: 'csp-large'
+        max_level: 6
+        min_level: 3
+        width_scale: 1.00
+        depth_scale: 1.00
+    decoder:
+      type: yolo_decoder
+      yolo_decoder:
+        version: v4
+        type: csp_large
+    head:
+      smart_bias: true
+    detection_generator:
+      box_type:
+        'all': scaled
+      scale_xy:
+        'all': 2.0
+      max_boxes: 300
+      nms_type: iou
+      iou_thresh: 0.001
+      nms_thresh: 0.65
+    loss:
+      use_scaled_loss: true
+      update_on_repeat: true
+      box_loss_type:
+        'all': ciou
+      ignore_thresh:
+        'all': 0.0
+      iou_normalizer:
+        'all': 0.05
+      cls_normalizer:
+        'all': 0.5
+      object_normalizer:
+        '6': 0.1
+        '5': 0.4
+        '4': 1.0
+        '3': 4.0
+      objectness_smooth:
+        'all': 1.0
+    norm_activation:
+      use_sync_bn: true
+    num_classes: 80
+    anchor_boxes:
+      anchors_per_scale: 4
+      boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45],
+              box: [61,45], box: [48,102], box: [119,96], box: [97,189],
+              box: [97,189], box: [217,184], box: [171,384], box: [324,451],
+              box: [324,451], box: [545,357], box: [616,618], box: [1024,1024]]
+  train_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
+    shuffle_buffer_size: 10000
+    parser:
+      mosaic:
+        mosaic_frequency: 1.0
+        mixup_frequency: 0.2
+        mosaic_crop_mode: 'scale'
+        mosaic_center: 0.0
+        aug_scale_min: 0.2
+        aug_scale_max: 1.8
+      max_num_instances: 300
+      letter_box: true
+      random_flip: true
+      aug_rand_translate: 0.5
+      area_thresh: 0.1
+  validation_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
+trainer:
+  train_steps: 831600 # epoch 300 to 450
+  optimizer_config:
+    learning_rate:
+      cosine:
+        decay_steps: 831600 # epoch 300 to 450
\ No newline at end of file
diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p7_1536_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p7_1536_tpu.yaml
new file mode 100644
index 00000000000..a28c691683b
--- /dev/null
+++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection-finetune/yolo_l_p7_1536_tpu.yaml
@@ -0,0 +1,86 @@
+# --experiment_type=large_yolo
+# mAP 55.3%
+runtime:
+  distribution_strategy: 'tpu'
+  mixed_precision_dtype: 'float32'
+  tpu_enable_xla_dynamic_padder: false
+task:
+  model:
+    input_size: [1536, 1536, 3]
+    backbone:
+      type: 'darknet'
+      darknet:
+        model_id: 'csp-large'
+        max_level: 7
+        min_level: 3
+        width_scale: 1.25
+        depth_scale: 1.00
+    decoder:
+      type: yolo_decoder
+      yolo_decoder:
+        version: v4
+        type: csp_large
+    head:
+      smart_bias: true
+    detection_generator:
+      box_type:
+        'all': scaled
+      scale_xy:
+        'all': 2.0
+      max_boxes: 300
+      nms_type: iou
+      iou_thresh: 0.001
+      nms_thresh: 0.65
+    loss:
+      use_scaled_loss: true
+      update_on_repeat: true
+      box_loss_type:
+        'all': ciou
+      ignore_thresh:
+        'all': 0.0
+      iou_normalizer:
+        'all': 0.05
+      cls_normalizer:
+        'all': 0.5
+      object_normalizer:
+        '7': 0.1
+        '6': 0.4
+        '5': 0.5
+        '4': 1.0
+        '3': 4.0
+      objectness_smooth:
+        'all': 1.0
+    norm_activation:
+      use_sync_bn: true
+    num_classes: 80
+    anchor_boxes:
+      anchors_per_scale: 4
+      boxes: [box: [13,17], box: [22,25], box: [55,41], box: [27,66],
+              box: [57,88], box: [112,69], box: [69,177], box: [136,138],
+              box: [136,138], box: [287,114], box: [134,275], box: [268,248],
+              box: [268,248], box: [232,504], box: [445,416], box: [640,640],
+              box: [812,393], box: [477,808], box: [1070,908], box: [1408,1408]]
+  train_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
+    shuffle_buffer_size: 10000
+    parser:
+      mosaic:
+        mosaic_frequency: 1.0
+        mixup_frequency: 0.2
+        mosaic_crop_mode: 'scale'
+        mosaic_center: 0.0
+        aug_scale_min: 0.2
+        aug_scale_max: 1.8
+      max_num_instances: 300
+      letter_box: true
+      random_flip: true
+      aug_rand_translate: 0.5
+      area_thresh: 0.1
+  validation_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
+trainer:
+  train_steps: 831600 # epoch 300 to 450
+  optimizer_config:
+    learning_rate:
+      cosine:
+        decay_steps: 831600 # epoch 300 to 450
\ No newline at end of file
diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_csp_640_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_csp_640_tpu.yaml
new file mode 100644
index 00000000000..e4a00b1d8c9
--- /dev/null
+++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_csp_640_tpu.yaml
@@ -0,0 +1,74 @@
+# --experiment_type=scaled_yolo
+# mAP 47.6
+runtime:
+  distribution_strategy: 'tpu'
+  mixed_precision_dtype: 'float32'
+  tpu_enable_xla_dynamic_padder: false
+task:
+  model:
+    input_size: [640, 640, 3]
+    backbone:
+      type: 'darknet'
+      darknet:
+        model_id: 'altered_cspdarknet53'
+        max_level: 5
+        min_level: 3
+    decoder:
+      type: yolo_decoder
+      yolo_decoder:
+        version: v4
+        type: csp
+    head:
+      smart_bias: true
+    detection_generator:
+      box_type:
+        'all': scaled
+      scale_xy:
+        'all': 2.0
+      max_boxes: 300
+      nms_type: iou
+      iou_thresh: 0.001
+      nms_thresh: 0.65
+    loss:
+      use_scaled_loss: true
+      update_on_repeat: true
+      box_loss_type:
+        'all': ciou
+      ignore_thresh:
+        'all': 0.0
+      iou_normalizer:
+        'all': 0.05
+      cls_normalizer:
+        'all': 0.3
+      object_normalizer:
+        '5': 0.28
+        '4': 0.70
+        '3': 2.80
+      objectness_smooth:
+        'all': 1.0
+    norm_activation:
+      use_sync_bn: true
+    num_classes: 80
+    anchor_boxes:
+      anchors_per_scale: 3
+      boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
+              box: [36, 75], box: [76, 55], box: [72, 146],
+              box: [142, 110], box: [192, 243], box: [459, 401]]
+  train_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
+    shuffle_buffer_size: 10000
+    parser:
+      mosaic:
+        mosaic_frequency: 1.0
+        mixup_frequency: 0.0
+        mosaic_crop_mode: 'scale'
+        mosaic_center: 0.25
+        aug_scale_min: 0.1
+        aug_scale_max: 1.9
+      max_num_instances: 300
+      letter_box: true
+      random_flip: true
+      aug_rand_translate: 0.1
+      area_thresh: 0.1
+  validation_data:
+    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p5_896_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p5_896_tpu.yaml
new file mode 100644
index
00000000000..0d5eb4efa5d --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p5_896_tpu.yaml @@ -0,0 +1,76 @@ +# --experiment_type=large_yolo +# mAP 50.5% +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [896, 896, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'csp-large' + max_level: 5 + min_level: 3 + width_scale: 1.00 + depth_scale: 1.00 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: csp_large + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.5 + object_normalizer: + '5': 0.4 + '4': 1.0 + '3': 4.0 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 4 + boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45], + box: [48,102], box: [119,96], box: [97,189], box: [217,184], + box: [171,384], box: [324,451], box: [616,618], box: [800,800]] + train_data: + input_path: 'gs://cam2-datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.0 + mosaic_crop_mode: 'scale' + mosaic_center: 0.0 + aug_scale_min: 0.5 + aug_scale_max: 1.5 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.5 + area_thresh: 0.1 + validation_data: + input_path: 'gs://cam2-datasets/coco/val*' diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p6_1280_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p6_1280_tpu.yaml new file mode 100644 index 00000000000..f1f8262199d --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p6_1280_tpu.yaml @@ -0,0 +1,78 @@ +# --experiment_type=large_yolo +# mAP 53.4% +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [1280, 1280, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'csp-large' + max_level: 6 + min_level: 3 + width_scale: 1.00 + depth_scale: 1.00 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: csp_large + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.5 + object_normalizer: + '6': 0.1 + '5': 0.4 + '4': 1.0 + '3': 4.0 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 4 + boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45], + box: [61,45], box: [48,102], box: [119,96], box: [97,189], + box: [97,189], box: [217,184], box: [171,384], box: [324,451], + box: [324,451], box: [545,357], box: [616,618], box: [1024,1024]] + train_data: + input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + 
mosaic_frequency: 1.0 + mixup_frequency: 0.0 + mosaic_crop_mode: 'scale' + mosaic_center: 0.0 + aug_scale_min: 0.5 + aug_scale_max: 1.5 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.5 + area_thresh: 0.0 + validation_data: + input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*' diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p7_1536_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p7_1536_tpu.yaml new file mode 100644 index 00000000000..4295d1ce828 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p7_1536_tpu.yaml @@ -0,0 +1,80 @@ +# --experiment_type=large_yolo +# mAP 54.6% +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [1536, 1536, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'csp-large' + max_level: 7 + min_level: 3 + width_scale: 1.25 + depth_scale: 1.00 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: csp_large + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.5 + object_normalizer: + '7': 0.1 + '6': 0.4 + '5': 0.5 + '4': 1.0 + '3': 4.0 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 4 + boxes: [box: [13,17], box: [22,25], box: [55,41], box: [27,66], + box: [57,88], box: [112,69], box: [69,177], box: [136,138], + box: [136,138], box: [287,114], box: [134,275], box: [268,248], + box: [268,248], box: [232,504], box: [445,416], box: [640,640], + box: [812,393], box: [477,808], box: [1070,908], box: [1408,1408]] + train_data: + input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.0 + mosaic_crop_mode: 'scale' + mosaic_center: 0.0 + aug_scale_min: 0.5 + aug_scale_max: 1.5 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.5 + area_thresh: 0.1 + validation_data: + input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*' diff --git a/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_tiny_416_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_tiny_416_tpu.yaml new file mode 100755 index 00000000000..f31ecd0bb16 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_tiny_416_tpu.yaml @@ -0,0 +1,75 @@ +# --experiment_type=yolo_darknet +# mAP 43.0 +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'bfloat16' +task: + model: + input_size: [416, 416, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'cspdarknettiny' + max_level: 5 + min_level: 4 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: tiny + head: + smart_bias: true + detection_generator: + box_type: + 'all': scaled + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true 
+ update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.3 + object_normalizer: + '5': 0.28 + '4': 0.70 + '3': 2.80 + objectness_smooth: + 'all': 1.0 + norm_activation: + activation: leaky + norm_epsilon: 0.0001 + norm_momentum: 0.97 + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 3 + boxes: [box: [10, 14], box: [23, 27], box: [37, 58], + box: [81, 82], box: [135, 169], box: [344, 319]] + train_data: + input_path: 'gs://cam2-datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.0 + mosaic_crop_mode: 'scale' + mosaic_center: 0.25 + aug_scale_min: 0.1 + aug_scale_max: 1.9 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.1 + area_thresh: 0.1 + validation_data: + input_path: 'gs://cam2-datasets/coco/val*' diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolov4-csp/tpu/640.yaml b/official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/tpu/640.yaml similarity index 100% rename from official/vision/beta/projects/yolo/configs/experiments/yolov4-csp/tpu/640.yaml rename to official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/tpu/640.yaml diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolov4/tpu/512.yaml b/official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_512_tpu.yaml similarity index 100% rename from official/vision/beta/projects/yolo/configs/experiments/yolov4/tpu/512.yaml rename to official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_512_tpu.yaml diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_tiny_416_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_tiny_416_tpu.yaml new file mode 100755 index 00000000000..c992bc5f8e1 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_tiny_416_tpu.yaml @@ -0,0 +1,102 @@ +# --experiment_type=yolo_darknet +# mAP 43.0 +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'bfloat16' +task: + smart_bias_lr: 0.0 + model: + darknet_based_model: true + input_size: [416, 416, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'cspdarknettiny' + max_level: 5 + min_level: 4 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: tiny + head: + smart_bias: true + detection_generator: + box_type: + 'all': original + scale_xy: + '5': 1.05 + '4': 1.05 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.60 + loss: + use_scaled_loss: false + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.7 + iou_normalizer: + 'all': 0.07 + cls_normalizer: + 'all': 1.0 + object_normalizer: + 'all': 1.0 + objectness_smooth: + 'all': 0.0 + max_delta: + 'all': 5.0 + norm_activation: + activation: leaky + norm_epsilon: 0.0001 + norm_momentum: 0.99 + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 3 + boxes: [box: [10, 14], box: [23, 27], box: [37, 58], + box: [81, 82], box: [135, 169], box: [344, 319]] + train_data: + global_batch_size: 256 + shuffle_buffer_size: 10000 + dtype: float32 + input_path: 'gs://cam2-datasets/coco/train*' + is_training: true + drop_remainder: true + parser: + mosaic: + mosaic_frequency: 0.75 + mixup_frequency: 0.0 + mosaic_crop_mode: 'crop' + mosaic_center: 0.2 + aug_scale_min: 0.7 + aug_scale_max: 1.3 + jitter: 0.3 + 
max_num_instances: 200 + letter_box: false + random_flip: true + aug_rand_saturation: 1.5 + aug_rand_brightness: 1.5 + aug_rand_hue: 0.1 + aug_scale_min: 0.5 + aug_scale_max: 1.5 + aug_rand_translate: 0.0 + jitter: 0.3 + area_thresh: 0.1 + random_pad: true + use_tie_breaker: true + anchor_thresh: 0.4 + validation_data: + global_batch_size: 8 + dtype: float32 + input_path: 'gs://cam2-datasets/coco/val*' + is_training: false + drop_remainder: true + parser: + max_num_instances: 200 + letter_box: false + use_tie_breaker: true + anchor_thresh: 0.4 + weight_decay: 0.000 + annotation_file: null diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolov4/imagenet_pretraining/cspdarknet53_256_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/yolov4/imagenet_pretraining/cspdarknet53_256_tpu.yaml new file mode 100644 index 00000000000..10dbdc56855 --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/yolov4/imagenet_pretraining/cspdarknet53_256_tpu.yaml @@ -0,0 +1,51 @@ +runtime: + distribution_strategy: 'mirrored' + mixed_precision_dtype: 'float32' +task: + model: + num_classes: 1001 + input_size: [256, 256, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'cspdarknet53' + norm_activation: + activation: 'mish' + losses: + l2_weight_decay: 0.0005 + one_hot: true + label_smoothing: 0.1 + train_data: + input_path: 'imagenet-2012-tfrecord/train*' + is_training: true + global_batch_size: 128 + dtype: 'float16' + validation_data: + input_path: 'imagenet-2012-tfrecord/valid*' + is_training: true + global_batch_size: 128 + dtype: 'float16' + drop_remainder: false +trainer: + train_steps: 1200000 # epochs: 120 + validation_steps: 400 # size of validation data + validation_interval: 10000 + steps_per_loop: 10000 + summary_interval: 10000 + checkpoint_interval: 10000 + optimizer_config: + optimizer: + type: 'sgd' + sgd: + momentum: 0.9 + learning_rate: + type: 'polynomial' + polynomial: + initial_learning_rate: 0.1 + end_learning_rate: 0.0001 + power: 4.0 + decay_steps: 1200000 + warmup: + type: 'linear' + linear: + warmup_steps: 1000 # learning rate rises from 0 to 0.1 over 1000 steps diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolox/yolov4_512_tpu.yaml b/official/vision/beta/projects/yolo/configs/experiments/yolox/yolov4_512_tpu.yaml new file mode 100644 index 00000000000..e918d34581b --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/yolox/yolov4_512_tpu.yaml @@ -0,0 +1,135 @@ +# --experiment_type=yolo_darknet +# mAP 43.0 +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'bfloat16' +task: + smart_bias_lr: 0.0 + model: + darknet_based_model: true + input_size: [512, 512, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'cspdarknet53' + max_level: 5 + min_level: 3 + decoder: + type: yolo_decoder + yolo_decoder: + version: v4 + type: regular + activation: leaky + head: + smart_bias: true + detection_generator: + box_type: + 'all': original + scale_xy: + '5': 1.05 + '4': 1.1 + '3': 1.2 + max_boxes: 200 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.60 + loss: + use_scaled_loss: false + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.7 + iou_normalizer: + 'all': 0.07 + cls_normalizer: + 'all': 1.0 + object_normalizer: + 'all': 1.0 + objectness_smooth: + 'all': 0.0 + max_delta: + 'all': 5.0 + norm_activation: + activation: mish + norm_epsilon: 0.0001 + norm_momentum: 0.99 + use_sync_bn: true + num_classes: 80 + anchor_boxes: + anchors_per_scale: 3 + boxes: [box: [12, 
16], box: [19, 36], box: [40, 28], + box: [36, 75], box: [76, 55], box: [72, 146], + box: [142, 110], box: [192, 243], box: [459, 401]] + train_data: + global_batch_size: 64 + dtype: float32 + input_path: 'gs://cam2-datasets/coco/train*' + is_training: true + drop_remainder: true + seed: 1000 + parser: + mosaic: + mosaic_frequency: 0.75 + mixup_frequency: 0.0 + mosaic_crop_mode: 'crop' + mosaic_center: 0.2 + aug_scale_min: 0.2 + aug_scale_max: 1.6 + jitter: 0.3 + max_num_instances: 200 + letter_box: false + random_flip: true + aug_rand_saturation: 1.5 + aug_rand_brightness: 1.5 + aug_rand_hue: 0.1 + aug_scale_min: 0.1 + aug_scale_max: 1.9 + aug_rand_translate: 0.0 + jitter: 0.3 + area_thresh: 0.1 + random_pad: true + use_tie_breaker: true + anchor_thresh: 0.4 + validation_data: + global_batch_size: 8 + dtype: float32 + input_path: 'gs://cam2-datasets/coco/val*' + is_training: false + drop_remainder: true + parser: + max_num_instances: 200 + letter_box: false + use_tie_breaker: true + anchor_thresh: 0.4 + weight_decay: 0.000 +trainer: + train_steps: 555000 + validation_steps: 625 + steps_per_loop: 1850 + summary_interval: 1850 + validation_interval: 9250 + checkpoint_interval: 1850 + optimizer_config: + ema: + average_decay: 0.9998 + trainable_weights_only: false + dynamic_decay: true + learning_rate: + type: stepwise + stepwise: + boundaries: [400000] + name: PiecewiseConstantDecay + values: [0.00131, 0.000131] + optimizer: + type: sgd_torch + sgd_torch: + momentum: 0.949 + momentum_start: 0.949 + nesterov: true + warmup_steps: 1000 + weight_decay: 0.0005 + name: SGD + warmup: + type: 'linear' + linear: + warmup_steps: 1000 # learning rate rises from 0 to 0.0013 over 1000 steps diff --git a/official/vision/beta/projects/yolo/configs/experiments/yolox/yolox.yaml b/official/vision/beta/projects/yolo/configs/experiments/yolox/yolox.yaml new file mode 100644 index 00000000000..1b3ae51979f --- /dev/null +++ b/official/vision/beta/projects/yolo/configs/experiments/yolox/yolox.yaml @@ -0,0 +1,92 @@ +# --experiment_type=yolox +# mAP +runtime: + distribution_strategy: 'tpu' + mixed_precision_dtype: 'float32' + tpu_enable_xla_dynamic_padder: false +task: + model: + input_size: [640, 640, 3] + backbone: + type: 'darknet' + darknet: + model_id: 'darknet53' + max_level: 5 + min_level: 3 + decoder: + type: yolo_decoder + yolo_decoder: + version: vx + type: regular + head: + smart_bias: true + detection_generator: + box_type: + 'all': anchor_free + scale_xy: + 'all': 2.0 + max_boxes: 300 + nms_type: iou + iou_thresh: 0.001 + nms_thresh: 0.65 + loss: + use_scaled_loss: true + update_on_repeat: true + box_loss_type: + 'all': ciou + ignore_thresh: + 'all': 0.0 + iou_normalizer: + 'all': 0.05 + cls_normalizer: + 'all': 0.3 + object_normalizer: + '5': 0.28 + '4': 0.70 + '3': 2.80 + objectness_smooth: + 'all': 1.0 + norm_activation: + use_sync_bn: true + num_classes: 80 + anchor_boxes: + level_limits: [64, 128] + anchors_per_scale: 1 + train_data: + input_path: 'gs://cam2-datasets/coco/train*' + shuffle_buffer_size: 10000 + parser: + mosaic: + mosaic_frequency: 1.0 + mixup_frequency: 0.2 + mosaic_crop_mode: 'scale' + mosaic_center: 0.25 + aug_scale_min: 0.1 + aug_scale_max: 1.9 + max_num_instances: 300 + letter_box: true + random_flip: true + aug_rand_translate: 0.1 + area_thresh: 0.1 + validation_data: + input_path: 'gs://cam2-datasets/coco/val*' +trainer: + train_steps: 277200 # epoch 300 + optimizer_config: + learning_rate: + cosine: + initial_learning_rate: 0.1 + decay_steps: 277200 # epoch 300 + 
    optimizer:
+      type: sgd_torch
+      sgd_torch:
+        momentum: 0.9
+        momentum_start: 0.9
+        nesterov: true
+        warmup_steps: 4620
+        weight_decay: 0.0005
+        name: SGD
+    warmup:
+      type: 'linear'
+      linear:
+        warmup_steps: 4620 # 5 epochs
diff --git a/official/vision/beta/projects/yolo/configs/head.py b/official/vision/beta/projects/yolo/configs/head.py
new file mode 100644
index 00000000000..0014b3e9a09
--- /dev/null
+++ b/official/vision/beta/projects/yolo/configs/head.py
@@ -0,0 +1,37 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Head configurations."""
+import dataclasses
+from typing import List, Optional
+
+from official.modeling import hyperparams
+
+
+@dataclasses.dataclass
+class YOLOXHead(hyperparams.Config):
+  """Parameterization for the YOLOX head."""
+  num_classes: Optional[int] = None
+  width: float = 1.0
+  strides: List[int] = dataclasses.field(default_factory=lambda: [8, 16, 32])
+  in_channels: List[int] = dataclasses.field(
+      default_factory=lambda: [256, 512, 1024])
+  depthwise: bool = False
+  activation: str = 'silu'
+
+
+@dataclasses.dataclass
+class Head(hyperparams.OneOfConfig):
+  type: Optional[str] = 'yolox_head'
+  yolox_head: YOLOXHead = YOLOXHead()
diff --git a/official/vision/beta/projects/yolo/configs/yolo.py b/official/vision/beta/projects/yolo/configs/yolo.py
index 37af5e73940..6435d09f708 100755
--- a/official/vision/beta/projects/yolo/configs/yolo.py
+++ b/official/vision/beta/projects/yolo/configs/yolo.py
@@ -24,9 +24,7 @@
 from official.modeling import hyperparams
 from official.vision.beta.configs import common
 from official.vision.beta.projects.yolo import optimization
-from official.vision.beta.projects.yolo.configs import backbones
-from official.vision.beta.projects.yolo.configs import decoders
-
+from official.vision.beta.projects.yolo.configs import backbones, decoders
 # pytype: disable=annotation-type-mismatch
@@ -137,6 +135,15 @@ class YoloHead(hyperparams.Config):
   smart_bias: bool = True
 
 
+@dataclasses.dataclass
+class YoloxHead(hyperparams.Config):
+  """Parameterization for the YOLOX Head."""
+  width: float = 1.0
+  depthwise: bool = False
+  activation: str = 'silu'
+  smart_bias: bool = True
+
+
 @dataclasses.dataclass
 class YoloDetectionGenerator(hyperparams.Config):
   box_type: FPNConfig = dataclasses.field(
@@ -173,8 +180,31 @@ class YoloLoss(hyperparams.Config):
   label_smoothing: float = 0.0
   use_scaled_loss: bool = True
   update_on_repeat: bool = True
-
+
+@dataclasses.dataclass
+class YoloxLoss(hyperparams.Config):
+  ignore_thresh: FPNConfig = dataclasses.field(
+      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0))
+  truth_thresh: FPNConfig = dataclasses.field(
+      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
+  box_loss_type: FPNConfig = dataclasses.field(
+      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 'iou'))
+  iou_normalizer: FPNConfig = dataclasses.field(
default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0)) + cls_normalizer: FPNConfig = dataclasses.field( + default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0)) + object_normalizer: FPNConfig = dataclasses.field( + default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0)) + max_delta: FPNConfig = dataclasses.field( + default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, np.inf)) + objectness_smooth: FPNConfig = dataclasses.field( + default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0)) + label_smoothing: float = 0.0 + use_scaled_loss: bool = True + update_on_repeat: bool = True + + @dataclasses.dataclass class Box(hyperparams.Config): box: List[int] = dataclasses.field(default=list) @@ -233,6 +263,28 @@ class Yolo(hyperparams.Config): darknet_based_model: bool = False +@dataclasses.dataclass +class Yolox(hyperparams.Config): + input_size: Optional[List[int]] = dataclasses.field( + default_factory=lambda: [640, 640, 3]) + backbone: backbones.Backbone = backbones.Backbone( + type='darknet', darknet=backbones.Darknet(model_id='darknet53')) + decoder: decoders.Decoder = decoders.Decoder( + type='yolo_decoder', + yolo_decoder=decoders.YoloDecoder(version='vx', type='regular')) + head: YoloxHead = YoloxHead() + detection_generator: YoloDetectionGenerator = YoloDetectionGenerator() + loss: YoloLoss = YoloLoss() + norm_activation: common.NormActivation = common.NormActivation( + activation='mish', + use_sync_bn=True, + norm_momentum=0.99, + norm_epsilon=0.001) + num_classes: int = 80 + anchor_boxes: AnchorBoxes = AnchorBoxes() + darknet_based_model: bool = False + + @dataclasses.dataclass class YoloTask(cfg.TaskConfig): per_category_metrics: bool = False @@ -483,7 +535,7 @@ def scaled_yolo() -> cfg.ExperimentConfig: 'momentum_start': 0.8, 'nesterov': True, 'warmup_steps': steps_per_epoch * warmup_epochs, - 'weight_decay': 0.0005 * train_batch_size / 64.0, + 'weight_decay': 0.0005, } }, 'learning_rate': { @@ -508,3 +560,341 @@ def scaled_yolo() -> cfg.ExperimentConfig: ]) return config + + +@exp_factory.register_config_factory('large_yolo') +def large_yolo() -> cfg.ExperimentConfig: + """COCO object detection with YOLOv4-csp and v4.""" + train_batch_size = 64 + eval_batch_size = 8 + train_epochs = 300 + fine_tune_epochs = 450 + warmup_epochs = 3 + + validation_interval = 5 + steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size + + max_num_instances = 300 + + config = cfg.ExperimentConfig( + runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), + task=YoloTask( + smart_bias_lr=0.1, + init_checkpoint_modules='', + annotation_file=None, + weight_decay=0.0, + model=Yolo( + darknet_based_model=False, + norm_activation=common.NormActivation( + activation='mish', + use_sync_bn=True, + norm_epsilon=0.0001, + norm_momentum=0.97), + head=YoloHead(smart_bias=True), + loss=YoloLoss(use_scaled_loss=True)), + train_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), + is_training=True, + global_batch_size=train_batch_size, + dtype='float32', + parser=Parser( + aug_rand_saturation=0.7, + aug_rand_brightness=0.4, + aug_rand_hue=0.015, + letter_box=True, + use_tie_breaker=True, + best_match_only=True, + anchor_thresh=4.0, + random_pad=False, + area_thresh=0.1, + max_num_instances=max_num_instances, + mosaic=Mosaic( + mosaic_crop_mode='scale', + mosaic_frequency=1.0, + mixup_frequency=0.0, + ))), + validation_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), + is_training=False, + global_batch_size=eval_batch_size, + drop_remainder=True, + dtype='float32', + 
parser=Parser( + letter_box=True, + use_tie_breaker=True, + best_match_only=True, + anchor_thresh=4.0, + area_thresh=0.1, + max_num_instances=max_num_instances, + ))), + trainer=cfg.TrainerConfig( + train_steps=train_epochs * steps_per_epoch, + validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, + validation_interval=validation_interval * steps_per_epoch, + steps_per_loop=steps_per_epoch, + summary_interval=steps_per_epoch, + checkpoint_interval=steps_per_epoch, + optimizer_config=optimization.OptimizationConfig({ + 'ema': { + 'average_decay': 0.9999, + 'trainable_weights_only': False, + 'dynamic_decay': True, + }, + 'optimizer': { + 'type': 'sgd_torch', + 'sgd_torch': { + 'momentum': 0.937, + 'momentum_start': 0.9, + 'nesterov': True, + 'warmup_steps': steps_per_epoch * warmup_epochs, + 'weight_decay': 0.0005, + } + }, + 'learning_rate': { + 'type': 'cosine', + 'cosine': { + 'initial_learning_rate': 0.01, + 'alpha': 0.2, + 'decay_steps': fine_tune_epochs * steps_per_epoch, + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': steps_per_epoch * warmup_epochs, + 'warmup_learning_rate': 0 + } + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + + return config + + +@exp_factory.register_config_factory('yolo_tiny') +def yolo_tiny() -> cfg.ExperimentConfig: + """COCO object detection with YOLOv3 and v4.""" + train_batch_size = 256 + eval_batch_size = 8 + train_epochs = 600 + steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size + validation_interval = 10 + + max_num_instances = 200 + config = cfg.ExperimentConfig( + runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), + task=YoloTask( + smart_bias_lr=0.1, + init_checkpoint='', + init_checkpoint_modules='backbone', + annotation_file=None, + weight_decay=0.0, + model=Yolo( + darknet_based_model=True, + norm_activation=common.NormActivation(use_sync_bn=True, + activation="leaky"), + head=YoloHead(smart_bias=True), + loss=YoloLoss(use_scaled_loss=False, update_on_repeat=True)), + train_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), + is_training=True, + global_batch_size=train_batch_size, + dtype='float32', + parser=Parser( + letter_box=False, + aug_rand_saturation=1.5, + aug_rand_brightness=1.5, + aug_rand_hue=0.1, + use_tie_breaker=True, + best_match_only=False, + anchor_thresh=0.4, + area_thresh=0.1, + max_num_instances=max_num_instances, + mosaic=Mosaic( + mosaic_frequency=0.75, + mixup_frequency=0.0, + mosaic_crop_mode='crop', + mosaic_center=0.2))), + validation_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), + is_training=False, + global_batch_size=eval_batch_size, + drop_remainder=True, + dtype='float32', + parser=Parser( + letter_box=False, + use_tie_breaker=True, + best_match_only=False, + anchor_thresh=0.4, + area_thresh=0.1, + max_num_instances=max_num_instances, + ))), + trainer=cfg.TrainerConfig( + train_steps=train_epochs * steps_per_epoch, + validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, + validation_interval=validation_interval * steps_per_epoch, + steps_per_loop=steps_per_epoch, + summary_interval=steps_per_epoch, + checkpoint_interval=steps_per_epoch, + optimizer_config=optimization.OptimizationConfig({ + 'ema': { + 'average_decay': 0.9998, + 'trainable_weights_only': False, + 'dynamic_decay': True, + }, + 'optimizer': { + 'type': 'sgd_torch', + 'sgd_torch': { + 'momentum': 0.9, + 'momentum_start': 0.9, + 'nesterov': True, + 'warmup_steps': 1000, + 'weight_decay': 
0.0005, + } + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': { + 'boundaries': [ + 0.8 * train_epochs * steps_per_epoch + ], + 'values': [ + 0.00261 * train_batch_size / 64.0, + 0.000261 * train_batch_size / 64.0, + ] + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': 1000, + 'warmup_learning_rate': 0 + } + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + + return config + + +@exp_factory.register_config_factory('yolox_regular') +def yolox_regular() -> cfg.ExperimentConfig: + """COCO object detection with YOLOvx.""" + train_batch_size = 128 + eval_batch_size = 8 + train_epochs = 300 + steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size + validation_interval = 5 + + max_num_instances = 200 + config = cfg.ExperimentConfig( + runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), + task=YoloTask( + smart_bias_lr=0.1, + init_checkpoint='', + init_checkpoint_modules='backbone', + annotation_file=None, + weight_decay=0.0, + model=Yolox( + darknet_based_model=True, + norm_activation=common.NormActivation(use_sync_bn=True), + head=YoloxHead(smart_bias=True), + loss=YoloLoss(use_scaled_loss=True, update_on_repeat=True), + anchor_boxes=AnchorBoxes( + anchors_per_scale=3, + boxes=[ + Box(box=[12, 16]), + Box(box=[19, 36]), + Box(box=[40, 28]), + Box(box=[36, 75]), + Box(box=[76, 55]), + Box(box=[72, 146]), + Box(box=[142, 110]), + Box(box=[192, 243]), + Box(box=[459, 401]) + ])), + train_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), + is_training=True, + global_batch_size=train_batch_size, + dtype='float32', + parser=Parser( + letter_box=False, + aug_rand_saturation=1.5, + aug_rand_brightness=1.5, + aug_rand_hue=0.1, + use_tie_breaker=True, + best_match_only=False, + anchor_thresh=0.4, + area_thresh=0.1, + max_num_instances=max_num_instances, + mosaic=Mosaic( + mosaic_frequency=0.75, + mixup_frequency=0.0, + mosaic_crop_mode='crop', + mosaic_center=0.2))), + validation_data=DataConfig( + input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), + is_training=False, + global_batch_size=eval_batch_size, + drop_remainder=True, + dtype='float32', + parser=Parser( + letter_box=False, + use_tie_breaker=True, + best_match_only=False, + anchor_thresh=0.4, + area_thresh=0.1, + max_num_instances=max_num_instances, + ))), + trainer=cfg.TrainerConfig( + train_steps=train_epochs * steps_per_epoch, + validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, + validation_interval=validation_interval * steps_per_epoch, + steps_per_loop=steps_per_epoch, + summary_interval=steps_per_epoch, + checkpoint_interval=steps_per_epoch, + optimizer_config=optimization.OptimizationConfig({ + 'ema': { + 'average_decay': 0.9998, + 'trainable_weights_only': False, + 'dynamic_decay': True, + }, + 'optimizer': { + 'type': 'sgd_torch', + 'sgd_torch': { + 'momentum': 0.949, + 'momentum_start': 0.949, + 'nesterov': True, + 'warmup_steps': 1000, + 'weight_decay': 0.0005, + } + }, + 'learning_rate': { + 'type': 'cosine', + 'cosine': { + 'initial_learning_rate': 0.02, + 'alpha': 0.2, + 'decay_steps': train_epochs * steps_per_epoch, + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': 5 * steps_per_epoch, + 'warmup_learning_rate': 0 + } + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + + return config diff --git a/official/vision/beta/projects/yolo/dataloaders/__init__.py 
b/official/vision/beta/projects/yolo/dataloaders/__init__.py index a25710c222e..e04127d3fc8 100644 --- a/official/vision/beta/projects/yolo/dataloaders/__init__.py +++ b/official/vision/beta/projects/yolo/dataloaders/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - diff --git a/official/vision/beta/projects/yolo/dataloaders/classification_input.py b/official/vision/beta/projects/yolo/dataloaders/classification_input.py index 57d7ec2382a..07498eb6476 100755 --- a/official/vision/beta/projects/yolo/dataloaders/classification_input.py +++ b/official/vision/beta/projects/yolo/dataloaders/classification_input.py @@ -14,6 +14,7 @@ """Classification decoder and parser.""" import tensorflow as tf + from official.vision.beta.dataloaders import classification_input from official.vision.beta.ops import preprocess_ops diff --git a/official/vision/beta/projects/yolo/dataloaders/yolo_input.py b/official/vision/beta/projects/yolo/dataloaders/yolo_input.py index a9953ce1e8d..bda46d15382 100755 --- a/official/vision/beta/projects/yolo/dataloaders/yolo_input.py +++ b/official/vision/beta/projects/yolo/dataloaders/yolo_input.py @@ -15,12 +15,10 @@ """Detection Data parser and processing for YOLO.""" import tensorflow as tf -from official.vision.beta.dataloaders import parser -from official.vision.beta.dataloaders import utils +from official.vision.beta.dataloaders import parser, utils from official.vision.beta.ops import box_ops as bbox_ops from official.vision.beta.ops import preprocess_ops -from official.vision.beta.projects.yolo.ops import anchor -from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.beta.projects.yolo.ops import anchor, preprocessing_ops class Parser(parser.Parser): @@ -237,14 +235,14 @@ def _parse_train_data(self, data): affine=affine, shuffle_boxes=False, area_thresh=self._area_thresh, - augment=True, + filter_and_clip_boxes=True, seed=self._seed) classes = tf.gather(classes, inds) info = infos[-1] else: image = tf.image.resize( image, (self._image_h, self._image_w), method='nearest') - output_size = tf.cast([640, 640], tf.float32) + output_size = tf.cast([self._image_h, self._image_w], tf.float32) boxes_ = bbox_ops.denormalize_boxes(boxes, output_size) inds = bbox_ops.get_non_empty_box_indices(boxes_) boxes = tf.gather(boxes, inds) @@ -286,7 +284,8 @@ def _parse_eval_data(self, data): # Clip and clean boxes. image = image / 255.0 boxes, inds = preprocessing_ops.transform_and_clip_boxes( - boxes, infos, shuffle_boxes=False, area_thresh=0.0, augment=True) + boxes, infos, shuffle_boxes=False, area_thresh=0.0, + filter_and_clip_boxes=False) classes = tf.gather(classes, inds) info = infos[-1] @@ -342,17 +341,17 @@ def _build_label(self, # Update the labels dictionary. if not is_training: - # Sets up groundtruth data for evaluation. 
      groundtruths = {
           'source_id': labels['source_id'],
-          'height': height,
-          'width': width,
-          'num_detections': tf.shape(gt_boxes)[0],
+          'height': data["height"],
+          'width': data["width"],
+          'num_detections': tf.shape(data["groundtruth_boxes"])[0],
           'image_info': info,
-          'boxes': gt_boxes,
-          'classes': gt_classes,
-          'areas': tf.gather(data['groundtruth_area'], inds),
+          'boxes': bbox_ops.denormalize_boxes(data["groundtruth_boxes"],
+              tf.cast([data["height"], data["width"]], gt_boxes.dtype)),
+          'classes': data["groundtruth_classes"],
+          'areas': data["groundtruth_area"],
           'is_crowds':
               tf.cast(tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
       }
diff --git a/official/vision/beta/projects/yolo/dataloaders/yolo_input_test.py b/official/vision/beta/projects/yolo/dataloaders/yolo_input_test.py
new file mode 100644
index 00000000000..8c9003a0bfa
--- /dev/null
+++ b/official/vision/beta/projects/yolo/dataloaders/yolo_input_test.py
@@ -0,0 +1,122 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Yolo dataset testing functions."""
+import os
+
+import tensorflow as tf
+from absl.testing import parameterized
+
+from official.core import task_factory, train_utils
+from official.vision.beta.projects.yolo.common import \
+    registry_imports  # pylint: disable=unused-import
+from official.vision.beta.projects.yolo.configs import \
+    darknet_classification as dcfg
+from official.vision.beta.projects.yolo.tasks import \
+    image_classification as imc
+
+PATH_TO_COCO = '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/'
+
+def test_yolo_input_task(scaled_pipeline=True, batch_size=1):
+  if not scaled_pipeline:
+    experiment = "yolo_darknet"
+    config_path = [
+        "official/vision/beta/projects/yolo/configs/experiments/yolov4/detection/yolov4_512_tpu.yaml"]
+  else:
+    experiment = "large_yolo"
+    # config_path = [
+    #     "official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p6_1280_tpu.yaml"]
+    config_path = [
+        "official/vision/beta/projects/yolo/configs/experiments/scaled-yolo/detection/yolo_l_p7_1536_tpu.yaml"]
+
+  config = train_utils.ParseConfigOptions(experiment=experiment,
+                                          config_file=config_path)
+  params = train_utils.parse_configuration(config)
+  config = params.task
+  task = task_factory.get_task(params.task)
+
+  config.train_data.global_batch_size = batch_size
+  config.validation_data.global_batch_size = 1
+  config.train_data.dtype = 'float32'
+  config.validation_data.dtype = 'float32'
+  config.validation_data.shuffle_buffer_size = 1
+  config.train_data.shuffle_buffer_size = 1
+  config.train_data.input_path = os.path.join(PATH_TO_COCO, 'train*')
+  config.validation_data.input_path = os.path.join(PATH_TO_COCO, 'val*')
+
+  with tf.device('/CPU:0'):
+    train_data = task.build_inputs(config.train_data)
+    test_data = task.build_inputs(config.validation_data)
+  return train_data, test_data, config
+
+def test_yolo_pipeline_visually(is_training=True, num=30):
+  # Visualize the data pipeline.
+  import matplotlib.pyplot as plt
+  dataset, testing, _ = test_yolo_input_task()
+
+  data = dataset if is_training else testing
+  data = data.take(num)
+  for l, (image, label) in enumerate(data):
+    image = tf.image.draw_bounding_boxes(
+        image, label['bbox'], [[0.0, 1.0, 1.0]])
+
+    gt = label['true_conf']
+
+    obj3 = tf.clip_by_value(gt['3'][..., 0], 0.0, 1.0)
+    obj4 = tf.clip_by_value(gt['4'][..., 0], 0.0, 1.0)
+    obj5 = tf.clip_by_value(gt['5'][..., 0], 0.0, 1.0)
+    obj6 = tf.clip_by_value(gt['6'][..., 0], 0.0, 1.0)
+    obj7 = tf.clip_by_value(gt['7'][..., 0], 0.0, 1.0)
+
+    for shind in range(1):
+      fig, axe = plt.subplots(2, 4)
+
+      image = image[shind]
+
+      axe[0, 0].imshow(image)
+      axe[0, 1].imshow(obj3[shind, ..., :3].numpy())
+      axe[0, 2].imshow(obj4[shind, ..., :3].numpy())
+      axe[0, 3].imshow(obj5[shind, ..., :3].numpy())
+      axe[1, 0].imshow(obj6[shind, ..., :3].numpy())
+      axe[1, 2].imshow(obj7[shind, ..., :3].numpy())
+      axe[1, 1].imshow(obj6[shind, ..., 3].numpy())
+      axe[1, 3].imshow(obj7[shind, ..., 3].numpy())
+
+      fig.set_size_inches(18.5, 6.5, forward=True)
+      plt.tight_layout()
+      plt.show()
+
+class YoloDetectionInputTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(('scaled', True), ('darknet', False))
+  def test_yolo_input(self, scaled_pipeline):
+    # Builds a pipeline from the config and tests the data pipeline shapes.
+    # dataset, _, params = test_yolo_input_task(
+    #     scaled_pipeline=scaled_pipeline,
+    #     batch_size=1)
+    _, dataset, params = test_yolo_input_task(
+        scaled_pipeline=scaled_pipeline,
+        batch_size=1)
+
+    dataset = dataset.take(100)
+
+    for image, label in dataset:
+      self.assertAllEqual(image.shape, ([1] + params.model.input_size))
+      self.assertTrue(
+          tf.reduce_all(tf.math.logical_and(image >= 0, image <= 1)))
+
+
+if __name__ == '__main__':
+  # tf.test.main()
+  test_yolo_pipeline_visually(is_training=True, num=20)
diff --git a/official/vision/beta/projects/yolo/losses/__init__.py b/official/vision/beta/projects/yolo/losses/__init__.py
index e419af524b5..e04127d3fc8 100644
--- a/official/vision/beta/projects/yolo/losses/__init__.py
+++ b/official/vision/beta/projects/yolo/losses/__init__.py
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- diff --git a/official/vision/beta/projects/yolo/losses/yolo_loss.py b/official/vision/beta/projects/yolo/losses/yolo_loss.py index aac117bdf58..d16db20fed0 100755 --- a/official/vision/beta/projects/yolo/losses/yolo_loss.py +++ b/official/vision/beta/projects/yolo/losses/yolo_loss.py @@ -19,9 +19,8 @@ import tensorflow as tf -from official.vision.beta.projects.yolo.ops import box_ops -from official.vision.beta.projects.yolo.ops import loss_utils -from official.vision.beta.projects.yolo.ops import math_ops +from official.vision.beta.projects.yolo.ops import (box_ops, loss_utils, + math_ops) class YoloLossBase(object, metaclass=abc.ABCMeta): diff --git a/official/vision/beta/projects/yolo/losses/yolo_loss_test.py b/official/vision/beta/projects/yolo/losses/yolo_loss_test.py index b9490181269..49af31e94b0 100755 --- a/official/vision/beta/projects/yolo/losses/yolo_loss_test.py +++ b/official/vision/beta/projects/yolo/losses/yolo_loss_test.py @@ -14,8 +14,8 @@ """Tests for yolo heads.""" -from absl.testing import parameterized import tensorflow as tf +from absl.testing import parameterized from official.vision.beta.projects.yolo.losses import yolo_loss diff --git a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py index 7c3086bd66d..d1fe0002af9 100644 --- a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py +++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py @@ -36,6 +36,7 @@ """ import collections + import tensorflow as tf from official.modeling import hyperparams @@ -225,7 +226,7 @@ def __call__(self, config, kwargs): False ], [ - 'DarkRes', 'csp', 1, True, 64, None, None, None, None, 'mish', -1, + 'DarkRes', 'csp', 1, False, 64, None, None, None, None, 'mish', -1, 1, 1, False ], [ diff --git a/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py b/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py index 9441b06a311..a045768b9c6 100644 --- a/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py +++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py @@ -15,12 +15,11 @@ # Lint as: python3 """Tests for yolo.""" -from absl.testing import parameterized import numpy as np import tensorflow as tf +from absl.testing import parameterized +from tensorflow.python.distribute import combinations, strategy_combinations -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations from official.vision.beta.projects.yolo.modeling.backbones import darknet diff --git a/official/vision/beta/projects/yolo/modeling/decoders/__init__.py b/official/vision/beta/projects/yolo/modeling/decoders/__init__.py index e419af524b5..e04127d3fc8 100644 --- a/official/vision/beta/projects/yolo/modeling/decoders/__init__.py +++ b/official/vision/beta/projects/yolo/modeling/decoders/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py index 51c39098861..ac53f2e23f3 100644 --- a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Feature Pyramid Network and Path Aggregation variants used in YOLO.""" -from typing import Mapping, Union, Optional +from typing import Mapping, Optional, Union import tensorflow as tf @@ -50,8 +50,18 @@ max_level_process_len=None, csp_stack=7, fpn_depth=7, + max_fpn_depth=5, + max_csp_stack=5, path_process_len=8, - fpn_filter_scale=2), + fpn_filter_scale=1), + csp_xlarge=dict( + embed_spp=False, + use_fpn=True, + max_level_process_len=None, + csp_stack=7, + fpn_depth=7, + path_process_len=8, + fpn_filter_scale=1), ), 'v3': dict( @@ -68,9 +78,22 @@ spp=dict( embed_spp=True, use_fpn=False, - max_level_process_len=2, - path_process_len=1), + max_level_process_len=None, + path_process_len=6), ), + 'vx': + dict( + regular=dict( + embed_spp=True, + use_fpn=False, + max_level_process_len=None, + path_process_len=5), + fpn=dict( + embed_spp=True, + use_fpn=True, + max_level_process_len=None, + path_process_len=5), + ), } @@ -87,6 +110,8 @@ class YoloFPN(tf.keras.layers.Layer): def __init__(self, fpn_depth=4, + max_fpn_depth=None, + max_csp_stack=None, use_spatial_attention=False, csp_stack=False, activation='leaky', @@ -104,8 +129,12 @@ def __init__(self, Args: fpn_depth: `int`, number of layers to use in each FPN path if you choose to use an FPN. + max_fpn_depth: `int`, number of layers to use in each FPN path + if you choose to use an FPN along the largest FPN level. use_spatial_attention: `bool`, use the spatial attention module. csp_stack: `bool`, CSPize the FPN. + max_csp_stack: `int`, number of layers to use for CSP on the largest_path + only. activation: `str`, the activation function to use typically leaky or mish. fpn_filter_scale: `int`, scaling factor for the FPN filters. use_sync_bn: if True, use synchronized batch normalization. @@ -121,6 +150,7 @@ def __init__(self, super().__init__(**kwargs) self._fpn_depth = fpn_depth + self._max_fpn_depth = max_fpn_depth or self._fpn_depth self._activation = activation self._use_sync_bn = use_sync_bn @@ -133,6 +163,7 @@ def __init__(self, self._use_spatial_attention = use_spatial_attention self._filter_scale = fpn_filter_scale self._csp_stack = csp_stack + self._max_csp_stack = max_csp_stack or min(self._max_fpn_depth, csp_stack) self._base_config = dict( activation=self._activation, @@ -184,6 +215,7 @@ def build(self, inputs): for level, depth in zip( reversed(range(self._min_level, self._max_level + 1)), self._depths): + if level == self._min_level: self.resamples[str(level)] = nn_blocks.PathAggregationBlock( filters=depth // 2, @@ -211,10 +243,10 @@ def build(self, inputs): else: self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( filters=depth, - repetitions=self._fpn_depth + 1 * int(self._csp_stack == 0), + repetitions=self._max_fpn_depth + 1 * int(self._csp_stack == 0), insert_spp=True, block_invert=False, - csp_stack=self._csp_stack, + csp_stack=min(self._csp_stack, self._max_fpn_depth), **self._base_config) def call(self, inputs): @@ -349,13 +381,16 @@ def build(self, inputs): downsample = False upsample = True - if self._csp_stack == 0: - proc_filters = lambda x: x - resample_filters = lambda x: x // 2 - else: - proc_filters = lambda x: x * 2 - resample_filters = lambda x: x for level, depth in zip(self._iterator, self._depths): + if level > 5: + proc_filters = lambda x: x * 2 + resample_filters = lambda x: x + elif self._csp_stack == 0: + proc_filters = lambda x: x + resample_filters = lambda x: x // 2 + else: + proc_filters = lambda x: x * 2 + resample_filters = lambda x: x if level == self._input: 
self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( filters=proc_filters(depth), @@ -396,7 +431,7 @@ def get_raw_depths(self, minimum_depth, inputs): depths = [] if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1: for i in range(self._min_level, self._max_level + 1): - depths.append(inputs[str(i)][-1] * 2) + depths.append(inputs[str(i)][-1]) else: for _ in range(self._min_level, self._max_level + 1): depths.append(minimum_depth) @@ -429,6 +464,8 @@ def __init__(self, use_spatial_attention=False, csp_stack=False, fpn_depth=4, + max_fpn_depth=None, + max_csp_stack=None, fpn_filter_scale=1, path_process_len=6, max_level_process_len=None, @@ -475,6 +512,8 @@ def __init__(self, self._input_specs = input_specs self._use_fpn = use_fpn self._fpn_depth = fpn_depth + self._max_fpn_depth = max_fpn_depth + self._max_csp_stack = max_csp_stack self._path_process_len = path_process_len self._max_level_process_len = max_level_process_len self._embed_spp = embed_spp @@ -514,8 +553,10 @@ def __init__(self, } if self._use_fpn: inter_outs = YoloFPN( - fpn_depth=self._fpn_depth, **self._base_config)( - inputs) + fpn_depth=self._fpn_depth, + max_fpn_depth=self._max_fpn_depth, + max_csp_stack=self._max_csp_stack, + **self._base_config)(inputs) outputs = YoloPAN(**self._decoder_config)(inter_outs) else: inter_outs = None diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py index 611c4585945..0e2f764ced7 100644 --- a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py @@ -15,13 +15,13 @@ # Lint as: python3 """Tests for YOLO.""" +import tensorflow as tf # Import libraries from absl.testing import parameterized -import tensorflow as tf +from tensorflow.python.distribute import combinations, strategy_combinations -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder as decoders +from official.vision.beta.projects.yolo.modeling.decoders import \ + yolo_decoder as decoders class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase): diff --git a/official/vision/beta/projects/yolo/modeling/factory.py b/official/vision/beta/projects/yolo/modeling/factory.py index a841131062a..e6e81719dd1 100644 --- a/official/vision/beta/projects/yolo/modeling/factory.py +++ b/official/vision/beta/projects/yolo/modeling/factory.py @@ -15,13 +15,15 @@ """Contains common factory functions yolo neural networks.""" from absl import logging + from official.vision.beta.modeling.backbones import factory as backbone_factory from official.vision.beta.modeling.decoders import factory as decoder_factory - from official.vision.beta.projects.yolo.configs import yolo from official.vision.beta.projects.yolo.modeling import yolo_model -from official.vision.beta.projects.yolo.modeling.heads import yolo_head -from official.vision.beta.projects.yolo.modeling.layers import detection_generator +from official.vision.beta.projects.yolo.modeling.heads import (yolo_head, + yolox_head) +from official.vision.beta.projects.yolo.modeling.layers import \ + detection_generator def build_yolo_detection_generator(model_config: yolo.Yolo, anchor_boxes): @@ -55,7 +57,9 @@ def build_yolo_head(input_specs, model_config: yolo.Yolo, l2_regularization): """Builds yolo head.""" min_level = min(map(int, 
input_specs.keys())) max_level = max(map(int, input_specs.keys())) - head = yolo_head.YoloHead( + + if isinstance(model_config, yolo.Yolox): + head = yolox_head.YoloxHead( min_level=min_level, max_level=max_level, classes=model_config.num_classes, @@ -64,6 +68,16 @@ def build_yolo_head(input_specs, model_config: yolo.Yolo, l2_regularization): norm_epsilon=model_config.norm_activation.norm_epsilon, kernel_regularizer=l2_regularization, smart_bias=model_config.head.smart_bias) + else: + head = yolo_head.YoloHead( + min_level=min_level, + max_level=max_level, + classes=model_config.num_classes, + boxes_per_level=model_config.anchor_boxes.anchors_per_scale, + norm_momentum=model_config.norm_activation.norm_momentum, + norm_epsilon=model_config.norm_activation.norm_epsilon, + kernel_regularizer=l2_regularization, + smart_bias=model_config.head.smart_bias) return head diff --git a/official/vision/beta/projects/yolo/modeling/heads/__init__.py b/official/vision/beta/projects/yolo/modeling/heads/__init__.py index e419af524b5..e04127d3fc8 100644 --- a/official/vision/beta/projects/yolo/modeling/heads/__init__.py +++ b/official/vision/beta/projects/yolo/modeling/heads/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py b/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py index 23d41a045e8..7ead787434d 100644 --- a/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py +++ b/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py @@ -15,6 +15,7 @@ """Yolo heads.""" import tensorflow as tf + from official.vision.beta.projects.yolo.modeling.layers import nn_blocks diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py b/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py index 8c5414e5d84..4d5e4af3efb 100644 --- a/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py +++ b/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py @@ -15,11 +15,12 @@ # Lint as: python3 """Tests for yolo heads.""" +import tensorflow as tf # Import libraries from absl.testing import parameterized -import tensorflow as tf -from official.vision.beta.projects.yolo.modeling.heads import yolo_head as heads +from official.vision.beta.projects.yolo.modeling.heads import \ + yolo_head as heads class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase): diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolox_head.py b/official/vision/beta/projects/yolo/modeling/heads/yolox_head.py new file mode 100644 index 00000000000..c580372f352 --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/heads/yolox_head.py @@ -0,0 +1,232 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Yolox heads.""" +import tensorflow as tf +from tensorflow.keras.models import Sequential + +from official.vision.beta.projects.yolo.modeling.layers import nn_blocks +from official.vision.beta.projects.yolo.ops import box_ops + + +class YoloxHead(tf.keras.layers.Layer): + """YOLOX Prediction Head.""" + + def __init__( + self, + min_level, + max_level, + classes=80, + boxes_per_level=1, + output_extras=0, + norm_momentum=0.99, + norm_epsilon=0.001, + kernel_initializer='VarianceScaling', + kernel_regularizer=None, + bias_regularizer=None, + activation='silu', + smart_bias=False, + use_separable_conv=False, + width_scaling=1.0, + prior_prob=1e-2, + **kwargs): + """YoloX Prediction Head initialization function. + + Args: + min_level: `int`, the minimum backbone output level. + max_level: `int`, the maximum backbone output level. + classes: `int`, number of classes per category. + boxes_per_level: `int`, number of boxes to predict per level. + output_extras: `int`, number of additional output channels that the head + should predict for non-object detection and non-image classification + tasks. + norm_momentum: `float`, normalization momentum for the moving average. + norm_epsilon: `float`, small float added to variance to avoid dividing by + zero. + kernel_initializer: kernel_initializer for convolutional layers. + kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + activation: `str`, the activation function to use. Default value: "silu". + smart_bias: `bool`, whether to use smart bias. + use_separable_conv: `bool`, whether to use separable convolutions. + width_scaling: `float`, factor by which the filters should be scaled. + prior_prob: `float`, prior probability between 0.0 and 1.0 used to + initialize the class and objectness prediction biases. Defaults to 1e-2. + **kwargs: keyword arguments to be passed.
+ """ + + super().__init__(**kwargs) + self._min_level = min_level + self._max_level = max_level + + self._key_list = [ + str(key) for key in range(self._min_level, self._max_level + 1) + ] + + self._classes = classes + self._boxes_per_level = boxes_per_level + self._output_extras = output_extras + self._width_scaling = width_scaling + self._smart_bias = smart_bias + self._use_separable_conv = use_separable_conv + self._prior_prob = prior_prob + + self._stems = dict() + + self._bias = -tf.math.log((1 - self._prior_prob) / self._prior_prob) + + self._base_config = dict( + activation=activation, + norm_momentum=norm_momentum, + norm_epsilon=norm_epsilon, + kernel_initializer=kernel_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer) + + + + def build(self, input_shape): + + self._cls_convs = dict() + self._reg_convs = dict() + + self._cls_preds = dict() + self._reg_preds = dict() + self._obj_preds = dict() + + self._cls_head = dict() + self._obj_head = dict() + self._reg_head = dict() + + for k in self._key_list: + self._stems[k] = nn_blocks.ConvBN( + filters=int(256 * self._width_scaling), + kernel_size=(1, 1), + strides=(1, 1), + use_bn=True, + use_separable_conv=self._use_separable_conv, + **self._base_config, + ) + + self._cls_convs[k] = Sequential( + [ + nn_blocks.ConvBN( + filters=int(256 * self._width_scaling), + kernel_size=(3, 3), + strides=(1, 1), + use_bn=True, + use_separable_conv=self._use_separable_conv, + **self._base_config, + ), + nn_blocks.ConvBN( + filters=int(256 * self._width_scaling), + kernel_size=(3, 3), + strides=(1, 1), + use_bn=True, + use_separable_conv=self._use_separable_conv, + **self._base_config, + ), + ] + ) + + self._reg_convs[k] = Sequential( + [ + nn_blocks.ConvBN( + filters=int(256 * self._width_scaling), + kernel_size=(3, 3), + strides=(1, 1), + use_bn=True, + use_separable_conv=self._use_separable_conv, + **self._base_config, + ), + nn_blocks.ConvBN( + filters=int(256 * self._width_scaling), + kernel_size=(3, 3), + strides=(1, 1), + use_bn=True, + use_separable_conv=self._use_separable_conv, + **self._base_config, + ), + ] + ) + + self._cls_preds[k] = tf.keras.layers.Conv2D( + filters=self._boxes_per_level * self._classes, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + bias_initializer=tf.keras.initializers.constant(self._bias)) + + self._reg_preds[k] = tf.keras.layers.Conv2D( + filters=4, + kernel_size=(1, 1), + strides=(1, 1), + padding='same') + + self._obj_preds[k] = tf.keras.layers.Conv2D( + filters=1 * self._boxes_per_level, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + bias_initializer=tf.keras.initializers.constant(self._bias)) + + for key in self._key_list: + self._cls_head[key] = Sequential() + self._cls_head[key].add(self._stems[key]) + self._cls_head[key].add(self._cls_convs[key]) + self._cls_head[key].add(self._cls_preds[key]) + + self._obj_head[key] = Sequential() + self._obj_head[key].add(self._stems[key]) + self._obj_head[key].add(self._reg_convs[key]) + self._obj_head[key].add(self._obj_preds[key]) + + self._reg_head[key] = Sequential() + self._reg_head[key].add(self._stems[key]) + self._reg_head[key].add(self._reg_convs[key]) + self._reg_head[key].add(self._reg_preds[key]) + + def call(self, inputs, *args, **kwargs): + outputs = dict() + + for k in self._key_list: + ordered_preds = [] + cls_output = self._cls_head[k](inputs[k]) + reg_output = self._reg_head[k](inputs[k]) + obj_output = self._obj_head[k](inputs[k]) + + for b in range(self._boxes_per_level): + 
ordered_preds.append(reg_output[:, :, :, 4 * b:4 * (b + 1)]) + ordered_preds.append(obj_output[:, :, :, b:b + 1]) + ordered_preds.append( + cls_output[:, :, :, self._classes * b:self._classes * (b + 1)]) + + output = tf.concat(ordered_preds, axis=-1) + outputs[k] = output + # Outputs are not flattened here. + return outputs + + def get_config(self): + config = dict( + min_level=self._min_level, + max_level=self._max_level, + classes=self._classes, + boxes_per_level=self._boxes_per_level, + output_extras=self._output_extras) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolox_head_test.py b/official/vision/beta/projects/yolo/modeling/heads/yolox_head_test.py new file mode 100644 index 00000000000..6f2068e7141 --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/heads/yolox_head_test.py @@ -0,0 +1,75 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Tests for yolox heads.""" + +import tensorflow as tf +# Import libraries +from absl.testing import parameterized + +from official.vision.beta.projects.yolo.modeling.heads import \ + yolox_head as heads + + +class YoloxHeadTest(parameterized.TestCase, tf.test.TestCase): + + def test_network_creation(self): + """Test creation of YOLOX heads.""" + tf.keras.backend.set_image_data_format('channels_last') + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + classes = 100 + bps = 3 + head = heads.YoloxHead(3, 5, classes=classes, boxes_per_level=bps) + + inputs = {} + for key in input_shape: + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + endpoints = head(inputs) + + for key in endpoints.keys(): + expected_input_shape = input_shape[key] + expected_input_shape[-1] = (classes + 5) * bps + self.assertAllEqual(endpoints[key].shape.as_list(), expected_input_shape) + + def test_serialize_deserialize(self): + # Create a network object that sets all of its config options.
+ tf.keras.backend.set_image_data_format('channels_last') + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + classes = 100 + bps = 3 + head = heads.YoloxHead(3, 5, classes=classes, boxes_per_level=bps) + + inputs = {} + for key in input_shape: + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + _ = head(inputs) + configs = head.get_config() + head_from_config = heads.YoloxHead.from_config(configs) + self.assertAllEqual(head.get_config(), head_from_config.get_config()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py index 68d70bdb978..ed0e26a6172 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py +++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py @@ -17,8 +17,7 @@ from official.vision.beta.modeling.layers import detection_generator from official.vision.beta.projects.yolo.losses import yolo_loss -from official.vision.beta.projects.yolo.ops import box_ops -from official.vision.beta.projects.yolo.ops import loss_utils +from official.vision.beta.projects.yolo.ops import box_ops, loss_utils @tf.keras.utils.register_keras_serializable(package='yolo') diff --git a/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py b/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py index ebe70060427..5d0d2b03a21 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py +++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py @@ -13,10 +13,11 @@ # limitations under the License. """Tests for yolo detection generator.""" -from absl.testing import parameterized import tensorflow as tf +from absl.testing import parameterized -from official.vision.beta.projects.yolo.modeling.layers import detection_generator as dg +from official.vision.beta.projects.yolo.modeling.layers import \ + detection_generator as dg class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase): diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py index 5fc98ea2f63..d5d0851114b 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """Contains common building blocks for yolo neural networks.""" from typing import Callable, List, Tuple import tensorflow as tf +import tensorflow.keras.backend as K +from tensorflow.keras import Sequential +from tensorflow.keras.layers import (Activation, BatchNormalization, Conv2D, + Layer, LeakyReLU, MaxPool2D, ReLU, + UpSampling2D, concatenate) from official.modeling import tf_utils from official.vision.beta.ops import spatial_transform_ops @@ -1725,3 +1729,28 @@ def call(self, x, training=None): x[..., 1::2, 1::2, :] ], axis=-1) + +class SiLU(Layer): + def __init__(self, *args, **kwargs): + super(SiLU, self).__init__(*args, **kwargs) + + def call(self, x, **kwargs): + return x * K.sigmoid(x) + + def get_config(self): + config = super(SiLU, self).get_config() + return config + + def compute_output_shape(self, input_shape): + return input_shape + +def get_activation(name="silu"): + if name == "silu": + module = SiLU() + elif name == "relu": + module = ReLU() + elif name == "lrelu": + module = LeakyReLU(0.1) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py index b43beefba60..71452792eb9 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 -from absl.testing import parameterized import numpy as np import tensorflow as tf +# Lint as: python3 +from absl.testing import parameterized from official.vision.beta.projects.yolo.modeling.layers import nn_blocks diff --git a/official/vision/beta/projects/yolo/modeling/yolo_model.py b/official/vision/beta/projects/yolo/modeling/yolo_model.py index 06f79750ea8..44129a2df55 100644 --- a/official/vision/beta/projects/yolo/modeling/yolo_model.py +++ b/official/vision/beta/projects/yolo/modeling/yolo_model.py @@ -15,7 +15,9 @@ """Yolo models.""" from typing import Mapping, Union + import tensorflow as tf + from official.vision.beta.projects.yolo.modeling.layers import nn_blocks diff --git a/official/vision/beta/projects/yolo/ops/__init__.py b/official/vision/beta/projects/yolo/ops/__init__.py index a25710c222e..e04127d3fc8 100644 --- a/official/vision/beta/projects/yolo/ops/__init__.py +++ b/official/vision/beta/projects/yolo/ops/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - diff --git a/official/vision/beta/projects/yolo/ops/anchor.py b/official/vision/beta/projects/yolo/ops/anchor.py index dfe675984a7..3aa6b09c738 100644 --- a/official/vision/beta/projects/yolo/ops/anchor.py +++ b/official/vision/beta/projects/yolo/ops/anchor.py @@ -16,13 +16,10 @@ import numpy as np import tensorflow as tf -from official.vision.beta.projects.yolo.ops import box_ops -from official.vision.beta.projects.yolo.ops import loss_utils -from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.beta.projects.yolo.ops import (box_ops, loss_utils, + preprocessing_ops) INF = 10000000 - def get_best_anchor(y_true, anchors, stride, @@ -32,12 +29,11 @@ def get_best_anchor(y_true, best_match_only=False, use_tie_breaker=True): """Get the correct anchor that is associated with each box using IOU. - + Args: y_true: tf.Tensor[] for the list of bounding boxes in the yolo format. - anchors: list or tensor for the anchor boxes to be used in prediction found - via Kmeans. - stride: `int` stride for the anchors. + anchors: list or tensor for the anchor boxes to be used in prediction + found via Kmeans. + stride: `int` stride for the anchors. width: int for the image width. height: int for the image height. iou_thresh: `float` the minimum iou threshold to use for selecting boxes for each level. best_match_only: `bool` if the box only has a single match and it is less than the iou threshold, when set to True, this match will be dropped as no anchors can be linked to it. use_tie_breaker: `bool` if there is many anchors for a given box, then - attempt to use all of them, if False, only the first matching box will be - used. - Returns: - tf.Tensor: y_true with the anchor associated with each ground truth box - known + attempt to use all of them, if False, only the first matching box will + be used. + Returns: + tf.Tensor: y_true with the anchor associated with each ground truth + box known """ with tf.name_scope('get_best_anchor'): width = tf.cast(width, dtype=tf.float32) @@ -61,7 +57,7 @@ def get_best_anchor(y_true, true_wh = tf.cast(y_true[..., 2:4], dtype=tf.float32) * scaler # scale down from large anchor to small anchor type anchors = tf.cast(anchors, dtype=tf.float32) / stride k = tf.shape(anchors)[0] @@ -93,7 +89,9 @@ def get_best_anchor(y_true, iou_type=3, ) values, indexes = tf.math.top_k( - iou_raw, k=tf.cast(k, dtype=tf.int32), sorted=True) + iou_raw, + k=tf.cast(k, dtype=tf.int32), + sorted=True) ind_mask = tf.cast(values >= iou_thresh, dtype=indexes.dtype) # pad the indexes such that all values less than the thresh are -1 @@ -104,21 +102,16 @@ elif use_tie_breaker: iou_index = tf.concat([ tf.expand_dims(indexes[..., 0], axis=-1), - ((indexes[..., 1:] + 1) * ind_mask[..., 1:]) - 1 - ], - axis=-1) + ((indexes[..., 1:] + 1) * ind_mask[..., 1:]) - 1], axis=-1) else: iou_index = tf.concat([ tf.expand_dims(indexes[..., 0], axis=-1), - tf.zeros_like(indexes[..., 1:]) - 1 - ], - axis=-1) + tf.zeros_like(indexes[..., 1:]) - 1], axis=-1) return tf.cast(iou_index, dtype=tf.float32), tf.cast(values, dtype=tf.float32) - class YoloAnchorLabeler: """Anchor labeler for the Yolo Models.""" def __init__(self, anchors=None, @@ -138,8 +131,8 @@ def __init__(self, anchor_free_level_limits: `List` the box sizes that will be allowed at each FPN level as is done in the FCOS and YOLOX paper for anchor free box assignment. - level_strides: `Dict[int]` for how much the model scales down the images - at the each level.
+ level_strides: `Dict[int]` for how much the model scales down the + images at each level. center_radius: `Dict[float]` for radius around each box center to search for extra centers in each level. max_num_instances: `int` for the number of boxes to compute loss on. @@ -159,8 +152,9 @@ def __init__(self, """ self.anchors = anchors self.masks = self._get_mask() + self.use_tie_breaker = use_tie_breaker self.anchor_free_level_limits = self._get_level_limits( - anchor_free_level_limits) + anchor_free_level_limits) if darknet and self.anchor_free_level_limits is None: center_radius = None @@ -172,7 +166,7 @@ self.num_instances = {key: maxim for key in self.keys} elif not darknet: self.num_instances = { - key: (6 - i) * max_num_instances for i, key in enumerate(self.keys) + key: (6 - i) * max_num_instances for i, key in enumerate(self.keys) } else: self.num_instances = {key: max_num_instances for key in self.keys} @@ -181,7 +175,6 @@ self.level_strides = level_strides self.match_threshold = match_threshold self.best_matches_only = best_matches_only - self.use_tie_breaker = use_tie_breaker self.dtype = dtype def _get_mask(self): @@ -203,8 +196,20 @@ def _get_level_limits(self, level_limits): level_limits_dict = {} level_limits = [0.0] + level_limits + [np.inf] + k = 0 - for i, key in enumerate(self.anchors.keys()): + for key in self.anchors.keys(): - level_limits_dict[key] = level_limits[i:i + 2] + level_limits_dict[key] = [] + + base = k + for _ in self.anchors[key]: + if self.use_tie_breaker: + base = k + level_limits_dict[key].append([level_limits[base], + level_limits[k + 1]]) + k += 1 + level_limits_dict[key] = tf.convert_to_tensor(level_limits_dict[key]) else: level_limits_dict = None return level_limits_dict @@ -225,29 +230,19 @@ def _tie_breaking_search(self, anchors, mask, boxes, classes): anchor_id = tf.cast(anchor_id, boxes.dtype) return boxes, classes, anchor_id - def _get_anchor_id(self, - key, - boxes, - classes, - width, - height, - stride, + def _get_anchor_id(self, key, boxes, classes, width, height, stride, iou_index=None): """Find the object anchor assignments in an anchor based paradigm.""" # find the best anchor anchors = self.anchors[key] num_anchors = len(anchors) if self.best_matches_only: # get the best anchor for each box - iou_index, _ = get_best_anchor( - boxes, - anchors, - stride, - width=width, - height=height, - best_match_only=True, - iou_thresh=self.match_threshold) + iou_index, _ = get_best_anchor(boxes, anchors, stride, + width=width, height=height, + best_match_only=True, + iou_thresh=self.match_threshold) mask = range(num_anchors) else: # search is done across FPN levels, get the mask of anchor indexes mask = self.masks[key] # search for the correct box to use - (boxes, classes, - anchors) = self._tie_breaking_search(iou_index, mask, boxes, classes) + (boxes, classes, anchors) = self._tie_breaking_search(iou_index, mask, + boxes, classes) return boxes, classes, anchors, num_anchors def _get_centers(self, boxes, classes, anchors, width, height, scale_xy): """Find the object center assignments in an anchor based paradigm."""
""" offset = tf.cast(0.5 * (scale_xy - 1), boxes.dtype) grid_xy, _ = tf.split(boxes, 2, axis=-1) @@ -270,18 +265,20 @@ def _get_centers(self, boxes, classes, anchors, width, height, scale_xy): centers = tf.math.floor(grid_xy) if offset != 0.0: - clamp = lambda x, ma: tf.maximum( # pylint:disable=g-long-lambda + clamp = lambda x, ma: tf.maximum( tf.minimum(x, tf.cast(ma, x.dtype)), tf.zeros_like(x)) grid_xy_index = grid_xy - centers positive_shift = ((grid_xy_index < offset) & (grid_xy > 1.)) - negative_shift = ((grid_xy_index > (1 - offset)) & (grid_xy < - (wh_scale - 1.))) + negative_shift = ( + (grid_xy_index > (1 - offset)) & (grid_xy < (wh_scale - 1.))) zero, _ = tf.split(tf.ones_like(positive_shift), 2, axis=-1) - shift_mask = tf.concat([zero, positive_shift, negative_shift], axis=-1) - offset = tf.cast([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], - offset.dtype) * offset + shift_mask = tf.concat( + [zero, positive_shift, negative_shift], axis=-1) + offset = tf.cast([[0, 0], [1, 0], + [0, 1], [-1, 0], + [0, -1]], offset.dtype) * offset num_shifts = tf.shape(shift_mask) num_shifts = num_shifts[-1] @@ -294,22 +291,24 @@ def _get_centers(self, boxes, classes, anchors, width, height, scale_xy): shift_ind = shift_ind - (1 - shift_mask) shift_ind = tf.expand_dims(shift_ind, axis=-1) - boxes_and_centers = tf.concat([boxes, classes, anchors, shift_ind], - axis=-1) + boxes_and_centers = tf.concat( + [boxes, classes, anchors, shift_ind], axis=-1) boxes_and_centers = tf.reshape(boxes_and_centers, [-1, 7]) _, center_ids = tf.split(boxes_and_centers, [6, 1], axis=-1) + #center_ids = tf.squeeze(center_ids, axis = -1) select = tf.where(center_ids >= 0) select, _ = tf.split(select, 2, axis=-1) boxes_and_centers = tf.gather_nd(boxes_and_centers, select) + # center_ids = tf.cast(center_ids, tf.int32) center_ids = tf.gather_nd(center_ids, select) center_ids = tf.cast(center_ids, tf.int32) shifts = tf.gather_nd(offset, center_ids) - boxes, classes, anchors, _ = tf.split( - boxes_and_centers, [4, 1, 1, 1], axis=-1) + boxes, classes, anchors, _ = tf.split(boxes_and_centers, + [4, 1, 1, 1], axis=-1) grid_xy, _ = tf.split(boxes, 2, axis=-1) centers = tf.math.floor(grid_xy * wh_scale - shifts) centers = clamp(centers, wh_scale - 1) @@ -318,7 +317,13 @@ def _get_centers(self, boxes, classes, anchors, width, height, scale_xy): centers = tf.cast(tf.concat([y, x, anchors], axis=-1), tf.int32) return boxes, classes, centers - def _get_anchor_free(self, key, boxes, classes, height, width, stride, + def _get_anchor_free(self, + key, + boxes, + classes, + height, + width, + stride, center_radius): """Find the box assignements in an anchor free paradigm.""" level_limits = self.anchor_free_level_limits[key] @@ -327,6 +332,7 @@ def _get_anchor_free(self, key, boxes, classes, height, width, stride, grid_points = tf.squeeze(grid_points, axis=0) box_list = boxes class_list = classes + num_anchors = 1 grid_points = (grid_points + 0.5) * stride x_centers, y_centers = grid_points[..., 0], grid_points[..., 1] @@ -345,13 +351,18 @@ def _get_anchor_free(self, key, boxes, classes, height, width, stride, b_b = tlbr_boxes[..., 2] - y_centers b_r = tlbr_boxes[..., 3] - x_centers box_delta = tf.stack([b_t, b_l, b_b, b_r], axis=-1) + is_in_boxes = tf.reduce_min(box_delta, axis=-1) > 0.0 if level_limits is not None: max_reg_targets_per_im = tf.reduce_max(box_delta, axis=-1) - gt_min = max_reg_targets_per_im >= level_limits[0] - gt_max = max_reg_targets_per_im <= level_limits[1] - is_in_boxes = tf.logical_and(gt_min, gt_max) - else: - 
is_in_boxes = tf.reduce_min(box_delta, axis=-1) > 0.0 + level_limits = tf.cast(level_limits, max_reg_targets_per_im.dtype) + num_anchors = tf.shape(level_limits)[0] + + max_reg_targets_per_im = tf.expand_dims(max_reg_targets_per_im, axis=-1) + + gt_min = max_reg_targets_per_im >= level_limits[..., 0] + gt_max = max_reg_targets_per_im <= level_limits[..., 1] + is_in_level = tf.logical_and(gt_min, gt_max) + is_in_boxes_all = tf.reduce_any(is_in_boxes, axis=(0, 1), keepdims=True) # check if the center is in the receptive field of this fpn level @@ -368,17 +379,22 @@ is_in_boxes_and_center = tf.logical_and(is_in_boxes, is_in_centers) is_in_boxes_and_center = tf.logical_and(is_in_index, is_in_boxes_and_center) - if self.use_tie_breaker: - boxes_all = tf.cast(is_in_boxes_and_center, area.dtype) - boxes_all = ((boxes_all * area) + ((1 - boxes_all) * INF)) - boxes_min = tf.reduce_min(boxes_all, axis=-1, keepdims=True) - boxes_min = tf.where(boxes_min == INF, -1.0, boxes_min) - is_in_boxes_and_center = boxes_all == boxes_min + if level_limits is not None: + is_in_boxes_and_center = tf.expand_dims(is_in_boxes_and_center, axis=-1) + is_in_boxes_and_center = tf.logical_and(is_in_level, + is_in_boxes_and_center) # construct the index update grid - reps = tf.reduce_sum(tf.cast(is_in_boxes_and_center, tf.int16), axis=-1) + reps = tf.reduce_sum(tf.cast(is_in_boxes_and_center, tf.int16), axis=-2) indexes = tf.cast(tf.where(is_in_boxes_and_center), tf.int32) - y, x, t = tf.split(indexes, 3, axis=-1) + y, x, t, a = tf.split(indexes, 4, axis=-1) boxes = tf.gather_nd(box_list, t) classes = tf.cast(tf.gather_nd(class_list, t), boxes.dtype) @@ -389,8 +405,8 @@ samples = tf.concat([boxes, conf, classes], axis=-1) - indexes = tf.concat([y, x, tf.zeros_like(t)], axis=-1) - return indexes, samples + indexes = tf.concat([y, x, a], axis=-1) + return indexes, samples, num_anchors def build_label_per_path(self, key, @@ -403,20 +419,23 @@ stride = self.level_strides[key] scale_xy = self.center_radius[key] if self.center_radius is not None else 1 width = tf.cast(width // stride, boxes.dtype) height = tf.cast(height // stride, boxes.dtype) if self.anchor_free_level_limits is None: - (boxes, classes, anchors, num_anchors) = self._get_anchor_id( - key, boxes, classes, width, height, stride, iou_index=iou_index) + (boxes, classes, + anchors, num_anchors) = self._get_anchor_id(key, boxes, classes, + width, height, stride, + iou_index=iou_index) boxes, classes, centers = self._get_centers(boxes, classes, anchors, width, height, scale_xy) ind_mask = tf.ones_like(classes) updates = tf.concat([boxes, ind_mask, classes], axis=-1) else: - num_anchors = 1 - (centers, updates) = self._get_anchor_free(key, boxes, classes, height, - width, stride, scale_xy) + (centers, updates, num_anchors) = self._get_anchor_free(key, boxes, + classes, height, + width, stride, + scale_xy) boxes, ind_mask, classes =
tf.split(updates, [4, 1, 1], axis=-1) width = tf.cast(width, tf.int32) @@ -438,17 +457,17 @@ def __call__(self, boxes, classes, width, height): """Builds the labels for a single image, not functional in batch mode. Args: - boxes: `Tensor` of shape [None, 4] indicating the object locations in an - image. + boxes: `Tensor` of shape [None, 4] indicating the object locations in + an image. classes: `Tensor` of shape [None] indicating the classes of each object. width: `int` for the images width. height: `int` for the images height. - Returns: centers: `Tensor` of shape [None, 3] of indexes in the final grid where boxes are located. updates: `Tensor` of shape [None, 8] the value to place in the final grid. - full: `Tensor` of [width/stride, height/stride, num_anchors, 1] holding + full: `Tensor` of [width/stride, height/stride, num_anchors, 2] holding a mask of where boxes are located for confidence losses. """ indexes = {} @@ -465,15 +484,11 @@ def __call__(self, boxes, classes, width, height): stride = tf.cast([width, height], boxes.dtype) # get the best anchor for each box - iou_index, _ = get_best_anchor( - boxes, - anchorsvec, - stride, - width=1.0, - height=1.0, - best_match_only=False, - use_tie_breaker=self.use_tie_breaker, - iou_thresh=self.match_threshold) + iou_index, _ = get_best_anchor(boxes, anchorsvec, stride, + width=1.0, height=1.0, + best_match_only=False, + use_tie_breaker=self.use_tie_breaker, + iou_thresh=self.match_threshold) for key in self.keys: indexes[key], updates[key], true_grids[key] = self.build_label_per_path( diff --git a/official/vision/beta/projects/yolo/ops/box_ops.py b/official/vision/beta/projects/yolo/ops/box_ops.py index 6d15f5d3157..968cb448522 100644 --- a/official/vision/beta/projects/yolo/ops/box_ops.py +++ b/official/vision/beta/projects/yolo/ops/box_ops.py @@ -14,7 +14,9 @@ """Yolo box ops.""" import math + import tensorflow as tf + from official.vision.beta.projects.yolo.ops import math_ops diff --git a/official/vision/beta/projects/yolo/ops/box_ops_test.py b/official/vision/beta/projects/yolo/ops/box_ops_test.py index afba1ee53c1..f0333209759 100644 --- a/official/vision/beta/projects/yolo/ops/box_ops_test.py +++ b/official/vision/beta/projects/yolo/ops/box_ops_test.py @@ -13,9 +13,9 @@ # limitations under the License.
"""box_ops tests.""" -from absl.testing import parameterized import numpy as np import tensorflow as tf +from absl.testing import parameterized from official.vision.beta.projects.yolo.ops import box_ops diff --git a/official/vision/beta/projects/yolo/ops/loss_utils.py b/official/vision/beta/projects/yolo/ops/loss_utils.py index 5536290199b..83946cd6a19 100755 --- a/official/vision/beta/projects/yolo/ops/loss_utils.py +++ b/official/vision/beta/projects/yolo/ops/loss_utils.py @@ -17,8 +17,7 @@ import numpy as np import tensorflow as tf -from official.vision.beta.projects.yolo.ops import box_ops -from official.vision.beta.projects.yolo.ops import math_ops +from official.vision.beta.projects.yolo.ops import box_ops, math_ops @tf.custom_gradient diff --git a/official/vision/beta/projects/yolo/ops/mosaic.py b/official/vision/beta/projects/yolo/ops/mosaic.py index 0ab68c2a6c9..c5be1d2ac45 100755 --- a/official/vision/beta/projects/yolo/ops/mosaic.py +++ b/official/vision/beta/projects/yolo/ops/mosaic.py @@ -14,11 +14,11 @@ """Mosaic op.""" import random + import tensorflow as tf import tensorflow_addons as tfa -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import preprocess_ops +from official.vision.beta.ops import box_ops, preprocess_ops from official.vision.beta.projects.yolo.ops import preprocessing_ops @@ -179,7 +179,7 @@ def _augment_image(self, infos, area_thresh=self._area_thresh, shuffle_boxes=False, - augment=True, + filter_and_clip_boxes=True, seed=self._seed) classes, is_crowd, area = self._select_ind(inds, classes, is_crowd, area) # pylint:disable=unbalanced-tuple-unpacking return image, boxes, classes, is_crowd, area, crop_points diff --git a/official/vision/beta/projects/yolo/ops/preprocessing_ops.py b/official/vision/beta/projects/yolo/ops/preprocessing_ops.py index fc642d20c2b..2d870dea1d8 100755 --- a/official/vision/beta/projects/yolo/ops/preprocessing_ops.py +++ b/official/vision/beta/projects/yolo/ops/preprocessing_ops.py @@ -482,11 +482,15 @@ def cast(values, dtype): image_ = tf.pad( cropped_image, [[pad[0], pad[2]], [pad[1], pad[3]], [0, 0]], constant_values=PAD_VALUE) + + # Pad and scale info + isize = tf.cast(tf.shape(image_)[:2], dtype=tf.float32) + osize = tf.cast((desired_size[0], desired_size[1]), dtype=tf.float32) pad_info = tf.stack([ tf.cast(tf.shape(cropped_image)[:2], tf.float32), - tf.cast(tf.shape(image_)[:2], dtype=tf.float32), - tf.ones_like(original_dims, dtype=tf.float32), - (-tf.cast(pad[:2], tf.float32)) + osize, + osize/isize, + (-tf.cast(pad[:2], tf.float32)*osize/isize) ]) infos.append(pad_info) @@ -761,7 +765,9 @@ def boxes_candidates(clipped_boxes, Returns: indices[:, 0]: A `Tensor` representing valid boxes after filtering. """ - + if area_thr == 0.0: + wh_thr = 0 + ar_thr = np.inf area_thr = tf.math.abs(area_thr) # Get the scaled and shifted heights of the original @@ -778,8 +784,8 @@ def boxes_candidates(clipped_boxes, clipped_height / (clipped_width + 1e-16)) # Ensure the clipped width adn height are larger than a preset threshold. - conda = clipped_width > wh_thr - condb = clipped_height > wh_thr + conda = clipped_width >= wh_thr + condb = clipped_height >= wh_thr # Ensure the area of the clipped box is larger than the area threshold. area = (clipped_height * clipped_width) / (og_width * og_height + 1e-16) @@ -837,7 +843,7 @@ def transform_and_clip_boxes(boxes, shuffle_boxes=False, area_thresh=0.1, seed=None, - augment=True): + filter_and_clip_boxes=True): """Clips and cleans the boxes. 
Args: @@ -868,8 +874,8 @@ def get_valid_boxes(boxes): # Make sure all boxes are valid to start, clip to [0, 1] and get only the # valid boxes. - output_size = tf.cast([640, 640], tf.float32) - if augment: + output_size = None + if filter_and_clip_boxes: boxes = tf.math.maximum(tf.math.minimum(boxes, 1.0), 0.0) cond = get_valid_boxes(boxes) @@ -918,16 +924,17 @@ def get_valid_boxes(boxes): boxes *= tf.cast(tf.expand_dims(cond, axis=-1), boxes.dtype) # Threshold the existing boxes. - if augment: - boxes_ = bbox_ops.denormalize_boxes(boxes, output_size) - box_history_ = bbox_ops.denormalize_boxes(box_history, output_size) - inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh) + if filter_and_clip_boxes: + if output_size is not None: + boxes_ = bbox_ops.denormalize_boxes(boxes, output_size) + box_history_ = bbox_ops.denormalize_boxes(box_history, output_size) + inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh) + else: + inds = boxes_candidates(boxes, box_history, wh_thr = 0.0, area_thr=area_thresh) # Select and gather the good boxes. if shuffle_boxes: inds = tf.random.shuffle(inds, seed=seed) else: - boxes = box_history - boxes_ = bbox_ops.denormalize_boxes(boxes, output_size) - inds = bbox_ops.get_non_empty_box_indices(boxes_) + inds = bbox_ops.get_non_empty_box_indices(boxes) boxes = tf.gather(boxes, inds) return boxes, inds diff --git a/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py b/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py index 43cca574b7f..a5dba12fc4e 100755 --- a/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py +++ b/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py @@ -13,9 +13,9 @@ # limitations under the License. """Tests for preprocessing_ops.py.""" -from absl.testing import parameterized import numpy as np import tensorflow as tf +from absl.testing import parameterized from official.vision.beta.ops import box_ops as bbox_ops from official.vision.beta.projects.yolo.ops import preprocessing_ops diff --git a/official/vision/beta/projects/yolo/optimization/__init__.py b/official/vision/beta/projects/yolo/optimization/__init__.py index 6ff51c80648..46d5d5003b9 100755 --- a/official/vision/beta/projects/yolo/optimization/__init__.py +++ b/official/vision/beta/projects/yolo/optimization/__init__.py @@ -14,9 +14,10 @@ """Optimization package definition.""" -# pylint: disable=wildcard-import from official.modeling.optimization.configs.learning_rate_config import * -from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage +from official.modeling.optimization.ema_optimizer import \ + ExponentialMovingAverage from official.vision.beta.projects.yolo.optimization.configs.optimization_config import * from official.vision.beta.projects.yolo.optimization.configs.optimizer_config import * -from official.vision.beta.projects.yolo.optimization.optimizer_factory import OptimizerFactory as YoloOptimizerFactory +from official.vision.beta.projects.yolo.optimization.optimizer_factory import \ + OptimizerFactory as YoloOptimizerFactory diff --git a/official/vision/beta/projects/yolo/optimization/configs/__init__.py b/official/vision/beta/projects/yolo/optimization/configs/__init__.py index e419af524b5..e04127d3fc8 100755 --- a/official/vision/beta/projects/yolo/optimization/configs/__init__.py +++ b/official/vision/beta/projects/yolo/optimization/configs/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - diff --git a/official/vision/beta/projects/yolo/optimization/configs/optimization_config.py b/official/vision/beta/projects/yolo/optimization/configs/optimization_config.py index 92b8d1a79b1..64ba9e985c6 100755 --- a/official/vision/beta/projects/yolo/optimization/configs/optimization_config.py +++ b/official/vision/beta/projects/yolo/optimization/configs/optimization_config.py @@ -21,8 +21,10 @@ import dataclasses from typing import Optional -from official.modeling.optimization.configs import optimization_config as optimization_cfg -from official.vision.beta.projects.yolo.optimization.configs import optimizer_config as opt_cfg +from official.modeling.optimization.configs import \ + optimization_config as optimization_cfg +from official.vision.beta.projects.yolo.optimization.configs import \ + optimizer_config as opt_cfg @dataclasses.dataclass diff --git a/official/vision/beta/projects/yolo/optimization/optimizer_factory.py b/official/vision/beta/projects/yolo/optimization/optimizer_factory.py index b2126d16bc2..bd3414215bb 100755 --- a/official/vision/beta/projects/yolo/optimization/optimizer_factory.py +++ b/official/vision/beta/projects/yolo/optimization/optimizer_factory.py @@ -16,8 +16,7 @@ import gin -from official.modeling.optimization import ema_optimizer -from official.modeling.optimization import optimizer_factory +from official.modeling.optimization import ema_optimizer, optimizer_factory from official.vision.beta.projects.yolo.optimization import sgd_torch optimizer_factory.OPTIMIZERS_CLS.update({ diff --git a/official/vision/beta/projects/yolo/optimization/sgd_torch.py b/official/vision/beta/projects/yolo/optimization/sgd_torch.py index a79e5671aef..b5ea7e471fc 100644 --- a/official/vision/beta/projects/yolo/optimization/sgd_torch.py +++ b/official/vision/beta/projects/yolo/optimization/sgd_torch.py @@ -15,8 +15,8 @@ """SGD PyTorch optimizer.""" import re -from absl import logging import tensorflow as tf +from absl import logging LearningRateSchedule = tf.keras.optimizers.schedules.LearningRateSchedule @@ -302,6 +302,7 @@ def get_config(self): "decay": self._initial_decay, "momentum": self._serialize_hyperparameter("momentum"), "momentum_start": self._serialize_hyperparameter("momentum_start"), + "weight_decay": self._serialize_hyperparameter("weight_decay"), "warmup_steps": self._serialize_hyperparameter("warmup_steps"), "nesterov": self.nesterov, }) diff --git a/official/vision/beta/projects/yolo/tasks/image_classification.py b/official/vision/beta/projects/yolo/tasks/image_classification.py index 4edef631fce..1094afb6586 100644 --- a/official/vision/beta/projects/yolo/tasks/image_classification.py +++ b/official/vision/beta/projects/yolo/tasks/image_classification.py @@ -15,10 +15,11 @@ """Image classification task definition.""" from official.common import dataset_fn from official.core import task_factory -from official.vision.beta.dataloaders import classification_input as classification_input_base -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.projects.yolo.configs import darknet_classification as exp_cfg +from official.vision.beta.dataloaders import \ + classification_input as classification_input_base +from official.vision.beta.dataloaders import input_reader_factory, tfds_factory +from official.vision.beta.projects.yolo.configs import \ + darknet_classification 
as exp_cfg from official.vision.beta.projects.yolo.dataloaders import classification_input from official.vision.beta.tasks import image_classification diff --git a/official/vision/beta/projects/yolo/tasks/yolo.py b/official/vision/beta/projects/yolo/tasks/yolo.py index 3683952a304..cd8641cc83f 100755 --- a/official/vision/beta/projects/yolo/tasks/yolo.py +++ b/official/vision/beta/projects/yolo/tasks/yolo.py @@ -17,25 +17,22 @@ import collections from typing import Optional -from absl import logging import tensorflow as tf +from absl import logging -from official.core import base_task -from official.core import config_definitions -from official.core import input_reader -from official.core import task_factory +from official.core import (base_task, config_definitions, input_reader, + task_factory) from official.modeling import performance -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.dataloaders import tf_example_label_map_decoder +from official.vision.beta.dataloaders import (tf_example_label_map_decoder, + tfds_factory) from official.vision.beta.evaluation import coco_evaluator from official.vision.beta.ops import box_ops from official.vision.beta.projects.yolo import optimization from official.vision.beta.projects.yolo.configs import yolo as exp_cfg -from official.vision.beta.projects.yolo.dataloaders import tf_example_decoder -from official.vision.beta.projects.yolo.dataloaders import yolo_input +from official.vision.beta.projects.yolo.dataloaders import (tf_example_decoder, + yolo_input) from official.vision.beta.projects.yolo.modeling import factory -from official.vision.beta.projects.yolo.ops import mosaic -from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.beta.projects.yolo.ops import mosaic, preprocessing_ops from official.vision.beta.projects.yolo.tasks import task_utils OptimizationConfig = optimization.OptimizationConfig @@ -255,16 +252,22 @@ def train_step(self, inputs, model, optimizer, metrics=None): logs.update({m.name: m.result()}) return logs - def _reorg_boxes(self, boxes, num_detections, image): + def _reorg_boxes(self, boxes, info, num_detections): """Scale and Clean boxes prior to Evaluation.""" - - # Build a prediciton mask to take only the number of detections mask = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1]) mask = tf.cast(tf.expand_dims(mask, axis=-1), boxes.dtype) # Denormalize the boxes by the shape of the image - inshape = tf.cast(preprocessing_ops.get_image_shape(image), boxes.dtype) + inshape = tf.expand_dims(info[:, 1, :], axis=1) + ogshape = tf.expand_dims(info[:, 0, :], axis=1) + scale = tf.expand_dims(info[:, 2, :], axis=1) + offset = tf.expand_dims(info[:, 3, :], axis=1) + boxes = box_ops.denormalize_boxes(boxes, inshape) + boxes = box_ops.clip_boxes(boxes, inshape) + boxes += tf.tile(offset, [1, 1, 2]) + boxes /= tf.tile(scale, [1, 1, 2]) + boxes = box_ops.clip_boxes(boxes, ogshape) # Mask the boxes for usage boxes *= mask @@ -292,10 +295,8 @@ def validation_step(self, inputs, model, metrics=None): logs = {self.loss: metric_loss} # Reorganize and rescale the boxes - boxes = self._reorg_boxes(y_pred['bbox'], y_pred['num_detections'], image) - label['groundtruths']['boxes'] = self._reorg_boxes( - label['groundtruths']['boxes'], label['groundtruths']['num_detections'], - image) + info = label['groundtruths']['image_info'] + boxes = self._reorg_boxes(y_pred['bbox'], info,
y_pred["num_detections"]) # Build the input for the coc evaluation metric coco_model_outputs = { diff --git a/official/vision/beta/projects/yolo/train.py b/official/vision/beta/projects/yolo/train.py index 78ee1ac32ae..09e97ab74ba 100644 --- a/official/vision/beta/projects/yolo/train.py +++ b/official/vision/beta/projects/yolo/train.py @@ -14,12 +14,12 @@ """TensorFlow Model Garden Vision training driver.""" -from absl import app -from absl import flags +from absl import app, flags from official.common import flags as tfm_flags from official.vision.beta import train -from official.vision.beta.projects.yolo.common import registry_imports # pylint: disable=unused-import +from official.vision.beta.projects.yolo.common import \ + registry_imports # pylint: disable=unused-import FLAGS = flags.FLAGS