diff --git a/api/common/env.py b/api/common/env.py
new file mode 100644
index 0000000000..ec84f19785
--- /dev/null
+++ b/api/common/env.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def benchmark_need_feed():
+    return os.environ.get("BENCHMARK_NEED_FEED", False)
+
+
+def benchmark_need_fetch():
+    return os.environ.get("BENCHMARK_NEED_FETCH", False)
diff --git a/api/common/launch.py b/api/common/launch.py
index 284fc6939f..eb89e25a3f 100644
--- a/api/common/launch.py
+++ b/api/common/launch.py
@@ -18,6 +18,7 @@
 import sys
 import argparse
 
+from common import env
 from common import system
 from common import api_param
 
@@ -33,9 +34,63 @@ def is_ampere_gpu():
     return False
 
 
+class TimeUnit(object):
+    def __init__(self):
+        self.kernel_time = 0.0
+        self.memory_time = 0.0
+        self.memcpy_h2d = 0.0
+        self.memcpy_d2h = 0.0
+        self.memcpy_d2d = 0.0
+        self.memset = 0.0
+
+    def total(self):
+        self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset
+        if not env.benchmark_need_fetch():
+            # Normally DtoH is fetching results.
+            self.memory_time += self.memcpy_d2h
+        return self.kernel_time + self.memory_time
+
+    def __str__(self):
+        total_time = self.total()
+        if env.benchmark_need_fetch():
+            infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format(
+                total_time)
+        else:
+            infostr = "total gpu_time: {:.4f} ms ".format(total_time)
+        if total_time > 0.0:
+            infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format(
+                self.kernel_time, self.kernel_time * 100 / total_time,
+                self.memory_time, self.memory_time * 100 / total_time)
+        else:
+            infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format(
+                self.kernel_time, self.memory_time)
+        infostr += "\n"
+        return infostr
+
+    def add_info(self, time, name):
+        if name == "[CUDA memcpy HtoD]":
+            self._update_memory_time("memcpy_h2d", time)
+        elif name == "[CUDA memcpy DtoH]":
+            self._update_memory_time("memcpy_d2h", time)
+        elif name == "[CUDA memcpy DtoD]":
+            self._update_memory_time("memcpy_d2d", time)
+        elif name == "[CUDA memset]":
+            self._update_memory_time("memset", time)
+        else:
+            self.kernel_time += time
+
+    def _update_memory_time(self, member_name, time):
+        assert member_name in [
+            "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset"
+        ]
+        setattr(self, member_name, time)
+        if member_name != "memcpy_d2h" or not env.benchmark_need_fetch():
+            self.memory_time += time
+
+
 class NvprofRunner(object):
-    def run(self, cmd):
-        stdout, exit_code = self._nvprof(cmd)
+    def run(self, cmd, profile_from_start=False):
+        stdout, exit_code = self._nvprof(cmd, profile_from_start)
         if exit_code == 0:
             parse_status, gpu_time = self._parse_logs(stdout.split("\n"))
             if parse_status:
@@ -43,9 +98,12 @@ def run(self, cmd):
         print("Running Error:\n {}".format(stdout))
         return 0.0
 
-    def _nvprof(self, cmd):
-        return system.run_command("nvprof --profile-from-start off {}".format(
-            cmd))
+    def _nvprof(self, cmd, profile_from_start):
+        if profile_from_start:
profile_cmd = "nvprof {}".format(cmd) + else: + profile_cmd = "nvprof --profile-from-start off {}".format(cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): line_from = None @@ -58,41 +116,44 @@ def _parse_logs(self, logs): line_to = i break if line_from is not None and line_to is not None: + time_unit = TimeUnit() for i in range(line_from, line_to): print(logs[i]) + if i >= line_from + 1: + begin_pos = 2 if i == line_from + 1 else 0 + gpu_time, percent, function = self._parse_line(logs[i], + begin_pos) + time_unit.add_info(gpu_time, function) print("") - return True, self._parse_gpu_time(logs[line_from + 1]) + print(time_unit) + return True, time_unit.total() else: return False, 0.0 - def _parse_gpu_time(self, line): - infos = line.strip().split() - percent = float(infos[2].replace("%", "")) * 0.01 - gpu_time = infos[3] - if gpu_time.endswith("us"): - gpu_time = float(gpu_time.replace("us", "")) * 0.001 - elif gpu_time.endswith("ms"): - gpu_time = float(gpu_time.replace("ms", "")) - elif gpu_time.endswith("s"): - gpu_time = float(gpu_time.replace("s", "")) * 1000 + def _to_millisecond(self, timestr): + if timestr.endswith("us"): + return float(timestr.replace("us", "")) * 0.001 + elif timestr.endswith("ms"): + return float(timestr.replace("ms", "")) + elif timestr.endswith("s"): + return float(timestr.replace("s", "")) * 1000 else: raise ValueError("Invalid time: %s" % gpu_time) - calls = int(infos[4]) - function = infos[8] - for i in range(9, len(infos)): - function = function + " " + infos[i] - #print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" % - # (percent, gpu_time, calls, function)) - total_gpu_time = gpu_time / percent - print("total gpu_time: %.4f ms" % total_gpu_time) - print("") - return total_gpu_time + def _parse_line(self, line, begin_pos=0): + infos = line.strip().split() + percent = float(infos[begin_pos].replace("%", "")) * 0.01 + gpu_time = self._to_millisecond(infos[begin_pos + 1]) + calls = int(infos[begin_pos + 2]) + function = infos[begin_pos + 6] + for i in range(begin_pos + 7, len(infos)): + function = function + " " + infos[i] + return gpu_time, percent, function class NsightRunner(object): - def run(self, cmd): - stdout, exit_code = self._nsight(cmd) + def run(self, cmd, profile_from_start=False): + stdout, exit_code = self._nsight(cmd, profile_from_start) if exit_code == 0: parse_status, gpu_time = self._parse_logs(stdout.split("\n")) if parse_status: @@ -100,9 +161,13 @@ def run(self, cmd): print("Running Error:\n {}".format(stdout)) return 0.0 - def _nsight(self, cmd): - return system.run_command( - "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd)) + def _nsight(self, cmd, profile_from_start): + if profile_from_start: + profile_cmd = "nsys nvprof -o tmp.qdrep {}".format(cmd) + else: + profile_cmd = "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format( + cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): kernel_line_from = None @@ -362,7 +427,8 @@ def launch(benchmark_script, task="speed", repeat=1, sync_interval=80, - with_nvprof=False): + with_nvprof=False, + profile_from_start=True): """ If with_nvprof is True, it will launch the following command firstly to get the gpu_time: @@ -371,7 +437,8 @@ def launch(benchmark_script, Then the normal testing command will be launched: python benchmark_script benchmark_script_args """ - if with_nvprof: + + if with_nvprof and not profile_from_start: if task == "speed": _set_args(benchmark_script_args, "--profiler", "nvprof") elif 
task == "scheduling": @@ -384,7 +451,7 @@ def launch(benchmark_script, runner = NsightRunner() else: runner = NvprofRunner() - gpu_time = runner.run(cmd) + gpu_time = runner.run(cmd, profile_from_start) _set_args(benchmark_script_args, "--profiler", "none") return gpu_time elif task == "scheduling": @@ -438,6 +505,7 @@ def _set_args(args, arg, value): args = parser.parse_args() benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args) task = benchmark_args_dict.get("task", "speed") + framework = benchmark_args_dict.get("framework", "paddle") use_gpu = system.str2bool(benchmark_args_dict.get( "use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES", None) != "" @@ -448,13 +516,15 @@ def _set_args(args, arg, value): system.check_commit() if use_gpu and task in ["speed", "scheduling"] and profiler == "none": + profile_from_start = False output_time = launch( args.benchmark_script, args.benchmark_script_args, task, repeat, sync_interval, - with_nvprof=True) + with_nvprof=True, + profile_from_start=profile_from_start) if task == "speed": args.benchmark_script_args.append(" --gpu_time ") args.benchmark_script_args.append(str(output_time)) diff --git a/api/common/special_op_list.py b/api/common/special_op_list.py index 32753ab1e1..4bdf713bb0 100644 --- a/api/common/special_op_list.py +++ b/api/common/special_op_list.py @@ -36,7 +36,6 @@ "arange", "argmax", "argmin", - "argsort", "assign", "cast", "clip_by_norm", diff --git a/api/common/tensorflow_op_benchmark.py b/api/common/tensorflow_op_benchmark.py index b50f842d75..8036a5f617 100644 --- a/api/common/tensorflow_op_benchmark.py +++ b/api/common/tensorflow_op_benchmark.py @@ -21,12 +21,14 @@ from common import special_op_list from common.benchmark import BenchmarkBase +from . import env from . import utils from . import api_param from . import feeder try: import tensorflow as tf + from tensorflow.python.profiler import model_analyzer from tensorflow.python.profiler import option_builder from tensorflow.core.protobuf import config_pb2 @@ -55,9 +57,9 @@ def __enter__(self): import cProfile self._profiler_handle = cProfile.Profile() self._profiler_handle.enable() - elif self.profiler != "none": + elif self.profiler == "native": self._profiler_handle = model_analyzer.Profiler( - graph=self.sess.graph) + graph=self._sess.graph) self.run_options = tf.compat.v1.RunOptions( trace_level=tf.compat.v1.RunOptions.FULL_TRACE) self.run_metadata = tf.compat.v1.RunMetadata() @@ -247,8 +249,10 @@ def generate_random_feeder(self, assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle." 
         if feeder_adapter is None or feeder_adapter.framework != "tensorflow":
-            self._need_feed = config.name == "feed"
-            self._need_fetch = use_feed_fetch or config.name == "fetch"
+            self._need_feed = env.benchmark_need_feed(
+            ) or config.name == "feed"
+            self._need_fetch = env.benchmark_need_fetch(
+            ) or use_feed_fetch or config.name == "fetch"
             self._feed_spec = feeder.copy_feed_spec(config.feed_spec)
             self._feed_dict = {}
 
@@ -294,12 +298,14 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None):
         self.fetch_list = fetch_list
 
         self.allow_growth = False if args.task == "speed" else True
-        outputs, stats = self.run_impl(
-            use_gpu=args.use_gpu,
-            config=config,
-            feed=feed,
-            repeat=args.repeat,
-            profiler=args.profiler)
+        device = "GPU:0" if args.use_gpu else "CPU"
+        with tf.device(device):
+            outputs, stats = self.run_impl(
+                use_gpu=args.use_gpu,
+                config=config,
+                feed=feed,
+                repeat=args.repeat,
+                profiler=args.profiler)
         return outputs, stats
 
     def _init_session(self, use_gpu):
diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py
index eff774a8a0..40299d9b6d 100644
--- a/api/deploy/collect_api_info.py
+++ b/api/deploy/collect_api_info.py
@@ -51,8 +51,16 @@ def collect_subclass_dict(test_cases_dict):
 
 
 def import_all_tests(test_module_name):
+    def _is_special_module(api_name):
+        special_module_list = [
+            "__init__", "common_import", "test_main", "fused_"
+        ]
+        for name in special_module_list:
+            if name in api_name:
+                return True
+        return False
+
     test_cases_dict = {}
-    special_module_list = ["__init__", "common_import", "test_main"]
 
     def _import_api(test_module_name, basename):
         try:
@@ -68,7 +76,7 @@ def _import_api(test_module_name, basename):
     for filename in sorted(os.listdir(tests_path)):
         api_name = os.path.splitext(filename)[0]
         file_extension = os.path.splitext(filename)[1]
-        if file_extension == '.py' and api_name not in special_module_list:
+        if file_extension == '.py' and not _is_special_module(api_name):
             module = _import_api(test_module_name, api_name)
             if module:
                 test_cases_dict[api_name] = module
@@ -134,7 +142,7 @@ def main(args):
     parser.add_argument(
         '--test_module_name',
         type=str,
-        default="tests",
+        default="tests_v2",
         help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).'
     )
     parser.add_argument(
diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh
index f717998a78..156a92a65f 100755
--- a/api/run_op_benchmark.sh
+++ b/api/run_op_benchmark.sh
@@ -4,7 +4,8 @@ OP_BENCHMARK_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")" && pwd )"
 
 test_module_name=${1:-"dynamic_tests_v2"} # "tests_v2", "dynamic_tests_v2"
 gpu_ids=${2:-"0"}
-op_type=${3:-"all"} # "all" or specified op_type, such as elementwise
+model_name_op_type=${3:-"all"} # "all" or specified model_name/op_type, such as elementwise
+config_subdir=${4:-"op_configs"}
 
 if [ ${test_module_name} != "tests_v2" ] && [ ${test_module_name} != "dynamic_tests_v2" ]; then
     echo "Please set test_module_name (${test_module_name}) to \"tests_v2\" or \"dynamic_tests_v2\"!"
@@ -63,10 +64,10 @@ run_op_benchmark() {
     bash ${OP_BENCHMARK_ROOT}/deploy/main_control.sh ${tests_dir} ${config_dir} ${output_dir} ${gpu_ids} "both" "both" "none" "both" "${testing_mode}" > ${log_path} 2>&1 &
 }
 
-run_specified_op() {
+run_specified_task() {
     local testing_mode=$1
 
-    OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${op_type}
+    OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${model_name_op_type}
     if [ ! -d ${OUTPUT_ROOT} ]; then
         mkdir -p ${OUTPUT_ROOT}
     fi
@@ -79,11 +80,11 @@ run_specified_op() {
     echo "-- output_dir: ${output_dir}"
 
     if [ "${test_module_name}" == "tests" ]; then
-        config_dir=${OP_BENCHMARK_ROOT}/tests/op_configs
-        op_list=${OUTPUT_ROOT}/api_info_${op_type}.txt
+        config_dir=${OP_BENCHMARK_ROOT}/tests/${config_subdir}
+        op_list=${OUTPUT_ROOT}/api_info_${model_name_op_type}.txt
     else
-        config_dir=${OP_BENCHMARK_ROOT}/tests_v2/op_configs
-        op_list=${OUTPUT_ROOT}/api_info_v2_${op_type}.txt
+        config_dir=${OP_BENCHMARK_ROOT}/tests_v2/${config_subdir}
+        op_list=${OUTPUT_ROOT}/api_info_v2_${model_name_op_type}.txt
     fi
 
     echo "-- config_dir: ${config_dir}"
@@ -101,15 +102,15 @@ main() {
         install_package "torch" "1.10.0"
     else
         testing_mode="static"
-        install_package "tensorflow" "2.3.1"
+        install_package "tensorflow" "2.8.0"
     fi
 
-    case ${op_type} in
+    case ${model_name_op_type} in
         all)
             run_op_benchmark ${testing_mode}
         ;;
         *)
-            run_specified_op ${testing_mode}
+            run_specified_task ${testing_mode}
         ;;
     esac
 }
diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py
index e74978a91e..9350d6aa0a 100644
--- a/api/tests_v2/batch_norm.py
+++ b/api/tests_v2/batch_norm.py
@@ -16,8 +16,8 @@
 
 
 class BatchNormConfig(APIConfig):
-    def __init__(self):
-        super(BatchNormConfig, self).__init__('batch_norm')
+    def __init__(self, op_type="batch_norm"):
+        super(BatchNormConfig, self).__init__(op_type)
 
     def init_from_json(self, filename, config_id=0, unknown_dim=16):
         super(BatchNormConfig, self).init_from_json(filename, config_id,
diff --git a/api/tests_v2/fused_batch_norm_add_relu.py b/api/tests_v2/fused_batch_norm_add_relu.py
new file mode 100644
index 0000000000..7ba239c613
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_add_relu.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormAddReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormAddReluConfig,
+              self).__init__("fused_batch_norm_add_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormAddRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        add_out = bn_out + y
+        relu_out = paddle.nn.functional.relu(add_out)
+
+        self.feed_vars = [x, y]
+        self.fetch_vars = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out, add_out])
+
+
+class TFFusedBatchNormAddRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        add_out = bn_out + y
+        relu_out = tf.nn.relu(add_out)
+
+        self.feed_list = [x, y]
+        self.fetch_list = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out,
+                                  [x, bn.gamma, bn.beta, bn_out, add_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormAddRelu(),
+        TFFusedBatchNormAddRelu(),
+        config=FusedBatchNormAddReluConfig())
diff --git a/api/tests_v2/fused_batch_norm_relu.py b/api/tests_v2/fused_batch_norm_relu.py
new file mode 100644
index 0000000000..326b972caf
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_relu.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormReluConfig, self).__init__("fused_batch_norm_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        relu_out = paddle.nn.functional.relu(bn_out)
+
+        self.feed_vars = [x]
+        self.fetch_vars = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out])
+
+
+class TFFusedBatchNormRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        relu_out = tf.nn.relu(bn_out)
+
+        self.feed_list = [x]
+        self.fetch_list = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, bn.gamma, bn.beta, bn_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormRelu(),
+        TFFusedBatchNormRelu(),
+        config=FusedBatchNormReluConfig())
diff --git a/ci/scripts/run_test.sh b/ci/scripts/run_test.sh
index 9518d6f160..ceb05b8a3a 100644
--- a/ci/scripts/run_test.sh
+++ b/ci/scripts/run_test.sh
@@ -74,6 +74,7 @@ function run_api(){
         LOG "[INFO] Found ${file} modified."
         api=${file#*api/} && api=${api%.*}
         [[ "$api" =~ "common_import" ]] && continue
+        [[ "$api" =~ "fused_" ]] && continue
        [[ "$api" =~ "test_main" ]] && continue
        [[ "$api" =~ "__init__" ]] && continue
        [ -f "${BENCHMARK_ROOT}/api/${api}.py" ] && API_NAMES[${#API_NAMES[@]}]=$api
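Note on the new environment toggles: api/common/env.py reads BENCHMARK_NEED_FEED and BENCHMARK_NEED_FETCH with os.environ.get, which returns a string whenever the variable is set. The minimal sketch below is not part of the patch (the variable name is real, the values are only illustrative); it shows that any non-empty value, even "0" or "False", enables the switch, so the variable must be left unset to keep the default.

# Illustrative sketch only (not part of the patch): behavior of the new
# toggles in api/common/env.py. os.environ.get() returns a string when the
# variable is set, so any non-empty value -- even "0" or "False" -- is truthy;
# unset the variable to disable the switch.
import os


def benchmark_need_fetch():
    return os.environ.get("BENCHMARK_NEED_FETCH", False)


os.environ["BENCHMARK_NEED_FETCH"] = "1"
print(bool(benchmark_need_fetch()))  # True

os.environ["BENCHMARK_NEED_FETCH"] = "False"
print(bool(benchmark_need_fetch()))  # still True: a non-empty string

del os.environ["BENCHMARK_NEED_FETCH"]
print(bool(benchmark_need_fetch()))  # False

When BENCHMARK_NEED_FETCH is set, TimeUnit.total() leaves DtoH memcpy time out of the accumulated memory time, which is why its summary line is labeled "total gpu_time (exclude DtoH)".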