From 9801241c4fdffa6baddb28abe4ca99390f78a91f Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 4 Jan 2022 13:13:51 +0000
Subject: [PATCH 01/12] Update the supported tf version to 2.7.0 and allow
 precise profiling.

---
 api/common/launch.py                   | 41 ++++++++-----
 api/common/tensorflow_api_benchmark.py | 32 ++++++----
 api/common/utils.py                    | 18 +++---
 api/run_op_benchmark.sh                |  2 +-
 api/tests_v2/batch_norm.py             | 82 +++++++++++---------------
 api/tests_v2/configs/batch_norm.json   |  6 +-
 6 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/api/common/launch.py b/api/common/launch.py
index 623de5caf6..463a9710cd 100644
--- a/api/common/launch.py
+++ b/api/common/launch.py
@@ -34,8 +34,8 @@ def is_ampere_gpu():
 
 
 class NvprofRunner(object):
-    def run(self, cmd):
-        stdout, exit_code = self._nvprof(cmd)
+    def run(self, cmd, profile_from_start=False):
+        stdout, exit_code = self._nvprof(cmd, profile_from_start)
         if exit_code == 0:
             parse_status, gpu_time = self._parse_logs(stdout.split("\n"))
             if parse_status:
@@ -43,9 +43,12 @@ def run(self, cmd):
         print("Running Error:\n {}".format(stdout))
         return 0.0
 
-    def _nvprof(self, cmd):
-        return system.run_command("nvprof --profile-from-start off {}".format(
-            cmd))
+    def _nvprof(self, cmd, profile_from_start):
+        if profile_from_start:
+            profile_cmd = "nvprof {}".format(cmd)
+        else:
+            profile_cmd = "nvprof --profile-from-start off {}".format(cmd)
+        return system.run_command(profile_cmd)
 
     def _parse_logs(self, logs):
         line_from = None
@@ -91,8 +94,8 @@ def _parse_gpu_time(self, line):
 
 
 class NsightRunner(object):
-    def run(self, cmd):
-        stdout, exit_code = self._nsight(cmd)
+    def run(self, cmd, profile_from_start=False):
+        stdout, exit_code = self._nsight(cmd, profile_from_start)
         if exit_code == 0:
             parse_status, gpu_time = self._parse_logs(stdout.split("\n"))
             if parse_status:
@@ -100,9 +103,13 @@ def run(self, cmd):
         print("Running Error:\n {}".format(stdout))
         return 0.0
 
-    def _nsight(self, cmd):
-        return system.run_command(
-            "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd))
+    def _nsight(self, cmd, profile_from_start):
+        if profile_from_start:
+            profile_cmd = "nsys nvprof -o tmp.qdrep {}".format(cmd)
+        else:
+            profile_cmd = "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(
+                cmd)
+        return system.run_command(profile_cmd)
 
     def _parse_logs(self, logs):
         kernel_line_from = None
@@ -168,7 +175,10 @@ def _parse_gpu_time(self, line):
         return gpu_time / percent
 
 
-def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
+def launch(benchmark_script,
+           benchmark_script_args,
+           with_nvprof=False,
+           profile_from_start=True):
     """
     If with_nvprof is True, it will launch the following command firstly to
     get the gpu_time:
@@ -188,7 +198,7 @@ def _set_profiler(args, value):
             args.append("--profiler")
             args.append(value)
 
-    if with_nvprof:
+    if with_nvprof and not profile_from_start:
         _set_profiler(benchmark_script_args, "nvprof")
     cmd = "{} {} {}".format(sys.executable, benchmark_script,
                             " ".join(benchmark_script_args))
@@ -197,7 +207,7 @@ def _set_profiler(args, value):
             runner = NsightRunner()
         else:
             runner = NvprofRunner()
-        gpu_time = runner.run(cmd)
+        gpu_time = runner.run(cmd, profile_from_start)
         _set_profiler(benchmark_script_args, "none")
         return gpu_time
     else:
@@ -234,6 +244,7 @@ def _args_list_to_dict(arg_list):
     args = parser.parse_args()
     benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args)
     task = benchmark_args_dict.get("task", "speed")
+    framework = benchmark_args_dict.get("framework", "paddle")
     use_gpu = system.str2bool(benchmark_args_dict.get(
         "use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES",
                                                 None) != ""
@@ -243,10 +254,12 @@ def _args_list_to_dict(arg_list):
     system.check_commit()
 
     if use_gpu and task == "speed" and profiler == "none":
+        profile_from_start = False
         total_gpu_time = launch(
             args.benchmark_script,
             args.benchmark_script_args,
-            with_nvprof=True)
+            with_nvprof=True,
+            profile_from_start=profile_from_start)
         args.benchmark_script_args.append(" --gpu_time ")
         args.benchmark_script_args.append(str(total_gpu_time))
 
diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py
index 9e3a65f01b..eb33a22285 100644
--- a/api/common/tensorflow_api_benchmark.py
+++ b/api/common/tensorflow_api_benchmark.py
@@ -40,21 +40,25 @@
 class Profiler(object):
     def __init__(self, name, sess, profiler):
         self.name = name
-        self.sess = sess
         self.profiler = profiler
-        self.profiler_handle = None
+        self._sess = sess
+        self._profiler_handle = None
         self.run_options = None
         self.run_metadata = None
         self.generate_timeline = False
 
     def __enter__(self):
-        if self.profiler == "pyprof":
+        if self.profiler == "nvprof":
+            import ctypes
+            self._cudart = ctypes.CDLL('libcudart.so')
+            self._cudart.cudaProfilerStart()
+        elif self.profiler == "pyprof":
             import cProfile
-            self.profiler_handle = cProfile.Profile()
-            self.profiler_handle.enable()
-        elif self.profiler != "none":
-            self.profiler_handle = model_analyzer.Profiler(
-                graph=self.sess.graph)
+            self._profiler_handle = cProfile.Profile()
+            self._profiler_handle.enable()
+        elif self.profiler == "native":
+            self._profiler_handle = model_analyzer.Profiler(
+                graph=self._sess.graph)
             if tf.__version__ < "1.15.0":
                 self.run_options = tf.RunOptions(
                     trace_level=tf.RunOptions.FULL_TRACE)
@@ -66,9 +70,9 @@ def __enter__(self):
         return self
 
     def add_step(self, step):
-        if self.profiler != "none" and self.profiler != "pyprof":
+        if self.profiler == "native":
             # Update profiler
-            self.profiler_handle.add_step(
+            self._profiler_handle.add_step(
                 step=step, run_meta=self.run_metadata)
             if self.generate_timeline:
                 # For timeline
@@ -78,16 +82,18 @@ def add_step(self, step):
                 trace_file.write(chrome_trace)
 
     def __exit__(self, exception_type, exception_value, traceback):
-        if self.profiler == "pyprof":
+        if self.profiler == "nvprof":
+            self._cudart.cudaProfilerStop()
+        elif self.profiler == "pyprof":
             import pstats, StringIO
-            self.profiler_handle.disable()
+            self._profiler_handle.disable()
             # self.profiler_handle.dump_stats("./outputs/" + self.name + ".pyprof")
             s = StringIO.StringIO()
             ps = pstats.Stats(
                 self.profiler_handle, stream=s).sort_stats("cumulative")
             ps.print_stats()
             print(s.getvalue())
-        elif self.profiler != "none":
+        elif self.profiler == "native":
             # Generate profiling result
             profile_op_builder = option_builder.ProfileOptionBuilder().select(
                 ['micros', 'occurrence']).order_by('micros').with_max_depth(5)
diff --git a/api/common/utils.py b/api/common/utils.py
index db0b6d8cc4..a6e81eff71 100644
--- a/api/common/utils.py
+++ b/api/common/utils.py
@@ -176,13 +176,17 @@ def check_outputs(output_list,
             target = target_list[i]
 
             if testing_mode == "static":
-                if isinstance(
-                        target,
-                        tf.python.framework.indexed_slices.IndexedSlicesValue):
-                    print(
-                        "---- Warning: Th %d-th target's type is IndexedSlicesValue and the check is skipped. "
-                        "It will be fixed later." % i)
-                    continue
+                try:
+                    if isinstance(target, tf.python.framework.indexed_slices.
+                                  IndexedSlicesValue):
+                        print(
+                            "---- Warning: Th %d-th target's type is IndexedSlicesValue and the check is skipped. "
+                            "It will be fixed later." % i)
+                        continue
+                except Exception as e:
+                    if tf.__version__ < "2.4.0":
+                        # I am not sure about the exact version
+                        print("Meets an exception: {}".format(e))
 
             output, target = _check_type(output, target)
             output, target = _check_shape(name, output, target, i)
diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh
index 42d5e993c2..a754353511 100755
--- a/api/run_op_benchmark.sh
+++ b/api/run_op_benchmark.sh
@@ -101,7 +101,7 @@ main() {
     install_package "torch" "1.10.0"
   else
     testing_mode="static"
-    install_package "tensorflow" "2.3.1"
+    install_package "tensorflow" "2.7.0"
   fi
 
   case ${op_type} in
diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py
index 3b786e42d2..ebaa7796bb 100644
--- a/api/tests_v2/batch_norm.py
+++ b/api/tests_v2/batch_norm.py
@@ -22,14 +22,6 @@ def __init__(self):
     def init_from_json(self, filename, config_id=0, unknown_dim=16):
         super(BatchNormConfig, self).init_from_json(filename, config_id,
                                                     unknown_dim)
-        # tf's batch_norm does not have data_format param, it only support NHWC format.
-        if self.data_format == "NCHW":
-            print(
-                "Warning:\n"
-                "  1. tf's batch_norm does not have data_format param, it only support NHWC format.\n"
-            )
-            self.run_tf = False
-
         if len(self.x_shape) == 4:
             if self.data_format == "NCHW":
                 self.num_channels = self.x_shape[1]
@@ -40,49 +32,47 @@ def init_from_json(self, filename, config_id=0, unknown_dim=16):
 
     def to_tensorflow(self):
         tf_config = super(BatchNormConfig, self).to_tensorflow()
-        if len(tf_config.x_shape) == 4:
-            tf_config.axes = [0, 1, 2]
+        if len(self.x_shape) == 4:
+            tf_config.axis = 1 if self.data_format == "NCHW" else 3
         else:
-            tf_config.axes = [0]
+            tf_config.axis = 1
         return tf_config
 
 
 class PDBatchNorm(PaddleAPIBenchmarkBase):
     def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
         x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
 
-        running_mean = paddle.create_parameter(
-            name='running_mean',
-            shape=[config.num_channels],
-            dtype=config.x_dtype,
-            attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(0.5)))
-        running_mean.stop_gradient = True
-        running_var = paddle.create_parameter(
-            name='running_var',
-            shape=[config.num_channels],
-            dtype=config.x_dtype,
-            attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(0.1)))
-        running_var.stop_gradient = True
-
-        scale = self.variable(
-            name='scale', shape=[config.num_channels], dtype=config.x_dtype)
-        bias = self.variable(
-            name='bias', shape=[config.num_channels], dtype=config.x_dtype)
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
 
         result = paddle.nn.functional.batch_norm(
             x=x,
             running_mean=running_mean,
             running_var=running_var,
-            weight=scale,  # scale
-            bias=bias,  # bias
+            weight=scale,
+            bias=bias,
             epsilon=config.epsilon,
             momentum=config.momentum,
             training=config.training,
             data_format=config.data_format)
 
-        self.feed_vars = [x, scale, bias]
+        self.feed_vars = [x]
         self.fetch_vars = [result]
         if config.backward:
             self.append_gradients(result, [x, scale, bias])
@@ -91,24 +81,20 @@ def build_program(self, config):
 class TFBatchNorm(TensorflowAPIBenchmarkBase):
     def build_graph(self, config):
         x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
-        scale = self.variable(
-            name='scale', shape=[config.num_channels], dtype=config.x_dtype)
-        bias = self.variable(
-            name='bias', shape=[config.num_channels], dtype=config.x_dtype)
-        mean, var = tf.nn.moments(
-            x=x, axes=config.axes, shift=None, keepdims=False)
-        result = tf.nn.batch_normalization(
-            x=x,
-            mean=mean,
-            variance=var,
-            offset=bias,
-            scale=scale,
-            variance_epsilon=config.epsilon)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        result = bn(x, training=config.training)
 
-        self.feed_list = [x, scale, bias]
+        self.feed_list = [x]
         self.fetch_list = [result]
         if config.backward:
-            self.append_gradients(result, [x, scale, bias])
+            self.append_gradients(result, [x, bn.gamma, bn.beta])
 
 
 if __name__ == '__main__':
diff --git a/api/tests_v2/configs/batch_norm.json b/api/tests_v2/configs/batch_norm.json
index d54219ae78..10e8e01218 100644
--- a/api/tests_v2/configs/batch_norm.json
+++ b/api/tests_v2/configs/batch_norm.json
@@ -23,7 +23,8 @@
             "type": "float",
             "value": "0.9"
         }
-    }
+    },
+    "atol": 1E-5
 }, {
     "config_id": 1,
     "op": "batch_norm",
@@ -49,7 +50,8 @@
             "type": "float",
             "value": "0.9"
         }
-    }
+    },
+    "atol": 1E-4
 }, {
     "config_id": 2,
     "op": "batch_norm",

From 6702a92b7be821acd925c14ef5f9fab65c7edadd Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 4 Jan 2022 14:09:31 +0000
Subject: [PATCH 02/12] Remove the support of tf under 1.15.0.

---
 api/common/tensorflow_api_benchmark.py | 72 +++++++++++---------------
 api/tests_v2/while_loop.py             | 22 +++-----
 2 files changed, 37 insertions(+), 57 deletions(-)

diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py
index eb33a22285..de22b091a4 100644
--- a/api/common/tensorflow_api_benchmark.py
+++ b/api/common/tensorflow_api_benchmark.py
@@ -28,6 +28,7 @@
 
 try:
     import tensorflow as tf
+
     from tensorflow.python.profiler import model_analyzer
     from tensorflow.python.profiler import option_builder
     from tensorflow.core.protobuf import config_pb2
@@ -59,14 +60,9 @@ def __enter__(self):
         elif self.profiler == "native":
             self._profiler_handle = model_analyzer.Profiler(
                 graph=self._sess.graph)
-            if tf.__version__ < "1.15.0":
-                self.run_options = tf.RunOptions(
-                    trace_level=tf.RunOptions.FULL_TRACE)
-                self.run_metadata = tf.RunMetadata()
-            else:
-                self.run_options = tf.compat.v1.RunOptions(
-                    trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
-                self.run_metadata = tf.compat.v1.RunMetadata()
+            self.run_options = tf.compat.v1.RunOptions(
+                trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
+            self.run_metadata = tf.compat.v1.RunMetadata()
         return self
 
     def add_step(self, step):
@@ -90,14 +86,15 @@ def __exit__(self, exception_type, exception_value, traceback):
             # self.profiler_handle.dump_stats("./outputs/" + self.name + ".pyprof")
             s = StringIO.StringIO()
             ps = pstats.Stats(
-                self.profiler_handle, stream=s).sort_stats("cumulative")
+                self._profiler_handle, stream=s).sort_stats("cumulative")
             ps.print_stats()
             print(s.getvalue())
         elif self.profiler == "native":
             # Generate profiling result
             profile_op_builder = option_builder.ProfileOptionBuilder().select(
                 ['micros', 'occurrence']).order_by('micros').with_max_depth(5)
-            self.profiler_handle.profile_operations(profile_op_builder.build())
+            self._profiler_handle.profile_operations(profile_op_builder.build(
+            ))
         return self
 
 
@@ -167,8 +164,9 @@ def __init__(self):
         try:
             import tensorflow as tf
             self.graph = tf.Graph()
-            if tf.__version__ > "1.15.0":
-                tf.compat.v1.disable_eager_execution()
+            assert tf.__version__ >= "1.15.0", "The installed tensorflow's version is expected to be newer than 1.15.0, but recieved {}".format(
+                tf.__version__)
+            tf.compat.v1.disable_eager_execution()
         except Exception as e:
             sys.stderr.write(
                 "Cannot import tensorflow, maybe tensorflow is not installed.\n"
@@ -180,11 +178,7 @@ def build_graph(self, config=None):
 
     def placeholder(self, name, shape, dtype):
         tf_dtype = tf.as_dtype(dtype)
-        if tf.__version__ >= "1.15.0":
-            var = tf.compat.v1.placeholder(
-                name=name, shape=shape, dtype=tf_dtype)
-        else:
-            var = tf.placeholder(name=name, shape=shape, dtype=tf_dtype)
+        var = tf.compat.v1.placeholder(name=name, shape=shape, dtype=tf_dtype)
         return var
 
     def variable(self, name, shape, dtype, value=None):
@@ -272,8 +266,6 @@ def _run_null_graph(self, use_gpu, repeat):
     def run_impl(self, use_gpu, feed, repeat=1, profiler="none"):
         sess = self._init_session(use_gpu)
 
-        #tf.debugging.set_log_device_placement(True)
-
         def _run_main_iter(run_options=None, run_metadata=None):
             feed_dict = feed if self._need_feed else None
             if self._need_fetch:
@@ -378,31 +370,27 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None):
         self.fetch_list = fetch_list
 
         self.allow_growth = False if args.task == "speed" else True
-        outputs, stats = self.run_impl(
-            use_gpu=args.use_gpu,
-            feed=feed,
-            repeat=args.repeat,
-            profiler=args.profiler)
+        device = "GPU:0" if args.use_gpu else "CPU"
+        with tf.device(device):
+            outputs, stats = self.run_impl(
+                use_gpu=args.use_gpu,
+                feed=feed,
+                repeat=args.repeat,
+                profiler=args.profiler)
         return outputs, stats
 
     def _init_session(self, use_gpu):
-        if tf.__version__ >= "1.15.0":
-            config = tf.compat.v1.ConfigProto()
-            if use_gpu:
-                config.gpu_options.allow_growth = self.allow_growth
-            else:
-                # In default, TF use full cpu cores, but Paddle use one cpu core.
-                # To make the same experiment, set TF use one cpu core as well.
-                # See https://github.com/PaddlePaddle/Paddle/issues/18665#issuecomment-513780210
-                config.intra_op_parallelism_threads = 1
-                config.inter_op_parallelism_threads = 1
-            sess = tf.compat.v1.Session(config=config)
-            sess.run(tf.compat.v1.global_variables_initializer())
-            sess.run(tf.compat.v1.local_variables_initializer())
-        else:
-            config = tf.ConfigProto()
+        config = tf.compat.v1.ConfigProto()
+        if use_gpu:
             config.gpu_options.allow_growth = self.allow_growth
-            sess = tf.Session(config=config)
-            sess.run(tf.global_variables_initializer())
-            sess.run(tf.local_variables_initializer())
+            config.graph_options.optimizer_options.global_jit_level = 2
+        else:
+            # In default, TF use full cpu cores, but Paddle use one cpu core.
+            # To make the same experiment, set TF use one cpu core as well.
+            # See https://github.com/PaddlePaddle/Paddle/issues/18665#issuecomment-513780210
+            config.intra_op_parallelism_threads = 1
+            config.inter_op_parallelism_threads = 1
+        sess = tf.compat.v1.Session(config=config)
+        sess.run(tf.compat.v1.global_variables_initializer())
+        sess.run(tf.compat.v1.local_variables_initializer())
         return sess
diff --git a/api/tests_v2/while_loop.py b/api/tests_v2/while_loop.py
index 95d471d828..bba3b22d0f 100644
--- a/api/tests_v2/while_loop.py
+++ b/api/tests_v2/while_loop.py
@@ -73,21 +73,13 @@ def cond(i, loop_len, input, result):
             return i < loop_len
 
         def body(i, loop_len, input, result):
-            if tf.__version__ <= "1.15.0":
-                result = tf.contrib.layers.fully_connected(
-                    inputs=input,
-                    num_outputs=config.size,
-                    weights_initializer=tf.constant_initializer(0.5),
-                    biases_initializer=tf.constant_initializer(0.1),
-                    activation_fn=None)
-            else:
-                result = tf.compat.v1.layers.dense(
-                    inputs=input,
-                    units=config.size,
-                    activation=None,
-                    use_bias=True,
-                    kernel_initializer=tf.constant_initializer(0.5),
-                    bias_initializer=tf.constant_initializer(0.1))
+            result = tf.compat.v1.layers.dense(
+                inputs=input,
+                units=config.size,
+                activation=None,
+                use_bias=True,
+                kernel_initializer=tf.constant_initializer(0.5),
+                bias_initializer=tf.constant_initializer(0.1))
             return [i + 1, loop_len, input, result]
 
         input = self.variable(

From f4ed56fb2279637553a2973d73cc692f6d306baf Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 4 Jan 2022 14:18:25 +0000
Subject: [PATCH 03/12] Allow controlling the benchmark through environment variables.

---
 api/common/env.py                      | 23 +++++++++++++++++++++++
 api/common/tensorflow_api_benchmark.py |  7 +++++--
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 api/common/env.py

diff --git a/api/common/env.py b/api/common/env.py
new file mode 100644
index 0000000000..00d82a33ba
--- /dev/null
+++ b/api/common/env.py
@@ -0,0 +1,23 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def benchmark_need_feed():
+    return os.environ.get("BENCHMARK_NEED_FEED", False)
+
+
+def benchmark_need_fetch():
+    return os.environ.get("BENCHMARK_NEED_FETCH", False)
diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py
index de22b091a4..5ac07d5d97 100644
--- a/api/common/tensorflow_api_benchmark.py
+++ b/api/common/tensorflow_api_benchmark.py
@@ -22,6 +22,7 @@
 import numpy as np
 from common import special_op_list
 
+from . import env
 from . import utils
 from . import api_param
 from . import feeder
@@ -323,8 +324,10 @@ def generate_random_feeder(self,
             assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle."
 
         if feeder_adapter is None or feeder_adapter.framework != "tensorflow":
-            self._need_feed = config.name == "feed"
-            self._need_fetch = use_feed_fetch or config.name == "fetch"
+            self._need_feed = env.benchmark_need_feed(
+            ) or config.name == "feed"
+            self._need_fetch = env.benchmark_need_fetch(
+            ) or use_feed_fetch or config.name == "fetch"
             self._feed_spec = feeder.copy_feed_spec(config.feed_spec)
             self._feed_dict = {}
 

From 6ec9b0f6b2680dd2ccf9b628dd95218d3043e79d Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 5 Jan 2022 03:11:46 +0000
Subject: [PATCH 04/12] Remove the setting of optimizer options and fix display
 bugs when writing the summary results to Excel.

---
 api/common/tensorflow_api_benchmark.py | 1 -
 api/deploy/write_excel.py              | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py
index 5ac07d5d97..3978ce769f 100644
--- a/api/common/tensorflow_api_benchmark.py
+++ b/api/common/tensorflow_api_benchmark.py
@@ -386,7 +386,6 @@ def _init_session(self, use_gpu):
         config = tf.compat.v1.ConfigProto()
         if use_gpu:
             config.gpu_options.allow_growth = self.allow_growth
-            config.graph_options.optimizer_options.global_jit_level = 2
         else:
             # In default, TF use full cpu cores, but Paddle use one cpu core.
             # To make the same experiment, set TF use one cpu core as well.
diff --git a/api/deploy/write_excel.py b/api/deploy/write_excel.py
index 3bcb9f1a3c..18e4726c80 100644
--- a/api/deploy/write_excel.py
+++ b/api/deploy/write_excel.py
@@ -191,12 +191,12 @@ def _write_title_and_set_column_width(worksheet, device, compare_framework,
             column_width.append(16)
             column_width.append(16)
             column_width.append(16)
-        if device == "gpu" and direction == "forward":
+        if device == "gpu" and direction in ["forward", "backward"]:
             title_names.append("paddle(gflops)")
             title_names.append("paddle(gbs)")
         title_names.append("accuracy")
         title_names.append("parameters")
-        if device == "gpu" and direction == "forward":
+        if device == "gpu" and direction in ["forward", "backward"]:
             column_width.append(16)
             column_width.append(16)
         column_width.append(10)

From fac27de6fd71bcb42ae7a73244056fc711230d0d Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 5 Jan 2022 14:06:37 +0000
Subject: [PATCH 05/12] Remove argsort from no_backward_ops list.

---
 api/common/special_op_list.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/api/common/special_op_list.py b/api/common/special_op_list.py
index cf4332a4f8..bd4409a759 100644
--- a/api/common/special_op_list.py
+++ b/api/common/special_op_list.py
@@ -36,7 +36,6 @@
     "arange",
     "argmax",
     "argmin",
-    "argsort",
     "assign",
     "cast",
     "clip_by_norm",

From aac10741eb93ccfd1ca4034438991f9f1d104e22 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Thu, 6 Jan 2022 03:12:36 +0000
Subject: [PATCH 06/12] Change the approval GitHub ids.

---
 ci/scripts/check_approval.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/scripts/check_approval.sh b/ci/scripts/check_approval.sh
index a870d6ccbf..9fab441968 100644
--- a/ci/scripts/check_approval.sh
+++ b/ci/scripts/check_approval.sh
@@ -25,7 +25,7 @@ BENCHMARK_ROOT=$(cd $(dirname $0)/../../ && pwd)
 
 declare -A FILE_APPROVAL_USER_MAP
 FILE_APPROVAL_USER_MAP=(
-  ["api/common/special_op_list.py"]="GaoWei8 wangchaochaohu zhangting2020"
+  ["api/common/special_op_list.py"]="JamesLim-sy ZzSean zhangting2020"
 )
 
 LOG "[INFO] Get approval list ..."

From cf162b2406b703d7d7f7a4a5c974f77de668ecb6 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Fri, 7 Jan 2022 10:03:38 +0000
Subject: [PATCH 07/12] Exclude the DtoH time when fetch is needed.

---
 api/common/launch.py | 100 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 21 deletions(-)

diff --git a/api/common/launch.py b/api/common/launch.py
index 463a9710cd..d3e9da005f 100644
--- a/api/common/launch.py
+++ b/api/common/launch.py
@@ -18,6 +18,7 @@
 import sys
 import argparse
 
+from common import env
 from common import system
 from common import api_param
 
@@ -33,6 +34,60 @@ def is_ampere_gpu():
     return False
 
 
+class TimeUnit(object):
+    def __init__(self):
+        self.kernel_time = 0.0
+        self.memory_time = 0.0
+        self.memcpy_h2d = 0.0
+        self.memcpy_d2h = 0.0
+        self.memcpy_d2d = 0.0
+        self.memset = 0.0
+
+    def total(self):
+        self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset
+        if not env.benchmark_need_fetch():
+            # Normally DtoH is fetching results.
+            self.memory_time += self.memcpy_d2h
+        return self.kernel_time + self.memory_time
+
+    def __str__(self):
+        total_time = self.total()
+        if env.benchmark_need_fetch():
+            infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format(
+                total_time)
+        else:
+            infostr = "total gpu_time: {:.4f} ms ".format(total_time)
+        if total_time > 0.0:
+            infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format(
+                self.kernel_time, self.kernel_time * 100 / total_time,
+                self.memory_time, self.memory_time * 100 / total_time)
+        else:
+            infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format(
+                self.kernel_time, self.memory_time)
+        infostr += "\n"
+        return infostr
+
+    def add_info(self, time, name):
+        if name == "[CUDA memcpy HtoD]":
+            self._update_memory_time("memcpy_h2d", time)
+        elif name == "[CUDA memcpy DtoH]":
+            self._update_memory_time("memcpy_d2h", time)
+        elif name == "[CUDA memcpy DtoD]":
+            self._update_memory_time("memcpy_d2d", time)
+        elif name == "[CUDA memset]":
+            self._update_memory_time("memset", time)
+        else:
+            self.kernel_time += time
+
+    def _update_memory_time(self, member_name, time):
+        assert member_name in [
+            "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset"
+        ]
+        setattr(self, member_name, time)
+        if member_name != "memcpy_d2h" or not env.benchmark_need_fetch():
+            self.memory_time += time
+
+
 class NvprofRunner(object):
     def run(self, cmd, profile_from_start=False):
         stdout, exit_code = self._nvprof(cmd, profile_from_start)
@@ -61,36 +116,39 @@ def _parse_logs(self, logs):
                 line_to = i
                 break
         if line_from is not None and line_to is not None:
+            time_unit = TimeUnit()
             for i in range(line_from, line_to):
                 print(logs[i])
+                if i >= line_from + 1:
+                    begin_pos = 2 if i == line_from + 1 else 0
+                    gpu_time, percent, function = self._parse_line(logs[i],
+                                                                   begin_pos)
+                    time_unit.add_info(gpu_time, function)
             print("")
-            return True, self._parse_gpu_time(logs[line_from + 1])
+            print(time_unit)
+            return True, time_unit.total()
         else:
             return False, 0.0
 
-    def _parse_gpu_time(self, line):
-        infos = line.strip().split()
-        percent = float(infos[2].replace("%", "")) * 0.01
-        gpu_time = infos[3]
-        if gpu_time.endswith("us"):
-            gpu_time = float(gpu_time.replace("us", "")) * 0.001
-        elif gpu_time.endswith("ms"):
-            gpu_time = float(gpu_time.replace("ms", ""))
-        elif gpu_time.endswith("s"):
-            gpu_time = float(gpu_time.replace("s", "")) * 1000
+    def _to_millisecond(self, timestr):
+        if timestr.endswith("us"):
+            return float(timestr.replace("us", "")) * 0.001
+        elif timestr.endswith("ms"):
+            return float(timestr.replace("ms", ""))
+        elif timestr.endswith("s"):
+            return float(timestr.replace("s", "")) * 1000
         else:
-            raise ValueError("Invalid time: %s" % gpu_time)
+            raise ValueError("Invalid time: %s" % timestr)
-        calls = int(infos[4])
-        function = infos[8]
-        for i in range(9, len(infos)):
-            function = function + " " + infos[i]
-        #print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" %
-        #      (percent, gpu_time, calls, function))
 
-        total_gpu_time = gpu_time / percent
-        print("total gpu_time: %.4f ms" % total_gpu_time)
-        print("")
-        return total_gpu_time
+    def _parse_line(self, line, begin_pos=0):
+        infos = line.strip().split()  # whitespace-separated columns
+        percent = float(infos[begin_pos].replace("%", "")) * 0.01
+        gpu_time = self._to_millisecond(infos[begin_pos + 1])
+        calls = int(infos[begin_pos + 2])  # parsed but not returned
+        function = infos[begin_pos + 6]
+        for i in range(begin_pos + 7, len(infos)):  # name may contain spaces
+            function = function + " " + infos[i]
+        return gpu_time, percent, function
 
 
 class NsightRunner(object):

From 685bf7077a739882568d49a32b2086890b592ae8 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 18 Jan 2022 02:51:32 +0000
Subject: [PATCH 08/12] Add fused_batch_norm_relu scripts.

---
 api/run_op_benchmark.sh               | 19 +++---
 api/tests_v2/batch_norm.py            |  4 +-
 api/tests_v2/fused_batch_norm_relu.py | 89 +++++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 11 deletions(-)
 create mode 100644 api/tests_v2/fused_batch_norm_relu.py

diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh
index 6208c01a2f..a7fd20948f 100755
--- a/api/run_op_benchmark.sh
+++ b/api/run_op_benchmark.sh
@@ -4,7 +4,8 @@ OP_BENCHMARK_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")" && pwd )"
 
 test_module_name=${1:-"dynamic_tests_v2"}  # "tests_v2", "dynamic_tests_v2"
 gpu_ids=${2:-"0"}
-op_type=${3:-"all"}  # "all" or specified op_type, such as elementwise
+model_name_op_type=${3:-"all"}  # "all" or specified model_name/op_type, such as elementwise
+config_subdir=${4:-"op_configs"}
 
 if [ ${test_module_name} != "tests_v2" ] && [ ${test_module_name} != "dynamic_tests_v2" ]; then
   echo "Please set test_module_name (${test_module_name}) to \"tests_v2\" or \"dynamic_tests_v2\"!"
@@ -63,10 +64,10 @@ run_op_benchmark() {
   bash ${OP_BENCHMARK_ROOT}/deploy/main_control.sh ${tests_dir} ${config_dir} ${output_dir} ${gpu_ids} "both" "both" "none" "both" "${testing_mode}" > ${log_path} 2>&1 &
 }
 
-run_specified_op() {
+run_specified_task() {
   local testing_mode=$1
 
-  OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${op_type}
+  OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${model_name_op_type}
   if [ ! -d ${OUTPUT_ROOT} ]; then
     mkdir -p ${OUTPUT_ROOT}
   fi
@@ -79,11 +80,11 @@ run_specified_op() {
   echo "-- output_dir: ${output_dir}"
   
   if [ "${test_module_name}" == "tests" ]; then
-    config_dir=${OP_BENCHMARK_ROOT}/tests/op_configs
-    op_list=${OUTPUT_ROOT}/api_info_${op_type}.txt
+    config_dir=${OP_BENCHMARK_ROOT}/tests/${config_subdir}
+    op_list=${OUTPUT_ROOT}/api_info_${model_name_op_type}.txt
   else
-    config_dir=${OP_BENCHMARK_ROOT}/tests_v2/op_configs
-    op_list=${OUTPUT_ROOT}/api_info_v2_${op_type}.txt
+    config_dir=${OP_BENCHMARK_ROOT}/tests_v2/${config_subdir}
+    op_list=${OUTPUT_ROOT}/api_info_v2_${model_name_op_type}.txt
   fi
   echo "-- config_dir: ${config_dir}"
  
@@ -104,12 +105,12 @@ main() {
     install_package "tensorflow" "2.7.0"
   fi
 
-  case ${op_type} in
+  case ${model_name_op_type} in
     all)
       run_op_benchmark ${testing_mode}
       ;;
     *)
-      run_specified_op ${testing_mode}
+      run_specified_task ${testing_mode}
       ;;
   esac
 }
diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py
index ebaa7796bb..43b250a69a 100644
--- a/api/tests_v2/batch_norm.py
+++ b/api/tests_v2/batch_norm.py
@@ -16,8 +16,8 @@
 
 
 class BatchNormConfig(APIConfig):
-    def __init__(self):
-        super(BatchNormConfig, self).__init__('batch_norm')
+    def __init__(self, op_type="batch_norm"):
+        super(BatchNormConfig, self).__init__(op_type)
 
     def init_from_json(self, filename, config_id=0, unknown_dim=16):
         super(BatchNormConfig, self).init_from_json(filename, config_id,
diff --git a/api/tests_v2/fused_batch_norm_relu.py b/api/tests_v2/fused_batch_norm_relu.py
new file mode 100644
index 0000000000..326b972caf
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_relu.py
@@ -0,0 +1,89 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormReluConfig, self).__init__("fused_batch_norm_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        relu_out = paddle.nn.functional.relu(bn_out)
+
+        self.feed_vars = [x]
+        self.fetch_vars = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out])
+
+
+class TFFusedBatchNormRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        relu_out = tf.nn.relu(bn_out)
+
+        self.feed_list = [x]
+        self.fetch_list = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, bn.gamma, bn.beta, bn_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormRelu(),
+        TFFusedBatchNormRelu(),
+        config=FusedBatchNormReluConfig())

From 4438014a5775b0f222e9168617faec10f8e3e671 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 19 Jan 2022 06:49:32 +0000
Subject: [PATCH 09/12] Do not test the fused_xxx ops in CI.

---
 api/deploy/collect_api_info.py | 10 ++++++++--
 ci/scripts/run_test.sh         |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py
index ee13bcd606..fa935d2d8f 100644
--- a/api/deploy/collect_api_info.py
+++ b/api/deploy/collect_api_info.py
@@ -50,8 +50,14 @@ def collect_subclass_dict(test_cases_dict):
 
 
 def import_all_tests(test_module_name):
+    def _is_special_module(api_name):
+        special_module_list = ["__init__", "common_import", "fused_"]
+        for name in special_module_list:
+            if name in api_name:
+                return True
+        return False
+
     test_cases_dict = {}
-    special_module_list = ["__init__", "common_import"]
 
     def _import_api(test_module_name, basename):
         try:
@@ -66,7 +72,7 @@ def _import_api(test_module_name, basename):
     for filename in sorted(os.listdir(tests_path)):
         api_name = os.path.splitext(filename)[0]
         file_extension = os.path.splitext(filename)[1]
-        if file_extension == '.py' and api_name not in special_module_list:
+        if file_extension == '.py' and _is_special_module(api_name):
             module = _import_api(test_module_name, api_name)
             if module:
                 test_cases_dict[api_name] = module
diff --git a/ci/scripts/run_test.sh b/ci/scripts/run_test.sh
index 8975ecb98b..3c0f288b04 100644
--- a/ci/scripts/run_test.sh
+++ b/ci/scripts/run_test.sh
@@ -74,6 +74,7 @@ function run_api(){
     LOG "[INFO] Found ${file} modified."
     api=${file#*api/} && api=${api%.*}
     [[ "$api" =~ "common_import" ]] && continue
+    [[ "$api" =~ "fused_" ]] && continue
     [ -f "${BENCHMARK_ROOT}/api/${api}.py" ] && API_NAMES[${#API_NAMES[@]}]=$api
     if [[ "$file" =~ ".json" ]]
     then

From f477069b6ebbf494e43bd933c480b575aa4a8368 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Fri, 21 Jan 2022 06:32:40 +0000
Subject: [PATCH 10/12] Add fused_batch_norm_add_relu and fix a bug.

---
 api/deploy/collect_api_info.py            |  4 +-
 api/tests_v2/fused_batch_norm_add_relu.py | 95 +++++++++++++++++++++++
 2 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 api/tests_v2/fused_batch_norm_add_relu.py

diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py
index fa935d2d8f..b0b91163df 100644
--- a/api/deploy/collect_api_info.py
+++ b/api/deploy/collect_api_info.py
@@ -72,7 +72,7 @@ def _import_api(test_module_name, basename):
     for filename in sorted(os.listdir(tests_path)):
         api_name = os.path.splitext(filename)[0]
         file_extension = os.path.splitext(filename)[1]
-        if file_extension == '.py' and _is_special_module(api_name):
+        if file_extension == '.py' and not _is_special_module(api_name):
             module = _import_api(test_module_name, api_name)
             if module:
                 test_cases_dict[api_name] = module
@@ -138,7 +138,7 @@ def main(args):
     parser.add_argument(
         '--test_module_name',
         type=str,
-        default="tests",
+        default="tests_v2",
         help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).'
     )
     parser.add_argument(
diff --git a/api/tests_v2/fused_batch_norm_add_relu.py b/api/tests_v2/fused_batch_norm_add_relu.py
new file mode 100644
index 0000000000..7ba239c613
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_add_relu.py
@@ -0,0 +1,95 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormAddReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormAddReluConfig,
+              self).__init__("fused_batch_norm_add_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormAddRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        add_out = bn_out + y
+        relu_out = paddle.nn.functional.relu(add_out)
+
+        self.feed_vars = [x, y]
+        self.fetch_vars = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out, add_out])
+
+
+class TFFusedBatchNormAddRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        add_out = bn_out + y
+        relu_out = tf.nn.relu(add_out)
+
+        self.feed_list = [x, y]
+        self.fetch_list = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out,
+                                  [x, bn.gamma, bn.beta, bn_out, add_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormAddRelu(),
+        TFFusedBatchNormAddRelu(),
+        config=FusedBatchNormAddReluConfig())

From ec29e7262331c8e3b06cc07aa607af24b146d6d7 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Mon, 21 Feb 2022 02:37:48 +0000
Subject: [PATCH 11/12] Update the TensorFlow version to 2.8.0.

---
 api/run_op_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh
index a7fd20948f..156a92a65f 100755
--- a/api/run_op_benchmark.sh
+++ b/api/run_op_benchmark.sh
@@ -102,7 +102,7 @@ main() {
     install_package "torch" "1.10.0"
   else
     testing_mode="static"
-    install_package "tensorflow" "2.7.0"
+    install_package "tensorflow" "2.8.0"
   fi
 
   case ${model_name_op_type} in

From 6678cfeef560d8b0b34d285059a2d3d3722131e5 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 2 Mar 2022 09:09:18 +0000
Subject: [PATCH 12/12] Change copyright.

---
 api/common/env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/common/env.py b/api/common/env.py
index 00d82a33ba..ec84f19785 100644
--- a/api/common/env.py
+++ b/api/common/env.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.