From 9801241c4fdffa6baddb28abe4ca99390f78a91f Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 4 Jan 2022 13:13:51 +0000 Subject: [PATCH 01/12] Update the supported tf'version to 2.7.0 and allow precisely profiling. --- api/common/launch.py | 41 ++++++++----- api/common/tensorflow_api_benchmark.py | 32 ++++++---- api/common/utils.py | 18 +++--- api/run_op_benchmark.sh | 2 +- api/tests_v2/batch_norm.py | 82 +++++++++++--------------- api/tests_v2/configs/batch_norm.json | 6 +- 6 files changed, 96 insertions(+), 85 deletions(-) diff --git a/api/common/launch.py b/api/common/launch.py index 623de5caf6..463a9710cd 100644 --- a/api/common/launch.py +++ b/api/common/launch.py @@ -34,8 +34,8 @@ def is_ampere_gpu(): class NvprofRunner(object): - def run(self, cmd): - stdout, exit_code = self._nvprof(cmd) + def run(self, cmd, profile_from_start=False): + stdout, exit_code = self._nvprof(cmd, profile_from_start) if exit_code == 0: parse_status, gpu_time = self._parse_logs(stdout.split("\n")) if parse_status: @@ -43,9 +43,12 @@ def run(self, cmd): print("Running Error:\n {}".format(stdout)) return 0.0 - def _nvprof(self, cmd): - return system.run_command("nvprof --profile-from-start off {}".format( - cmd)) + def _nvprof(self, cmd, profile_from_start): + if profile_from_start: + profile_cmd = "nvprof {}".format(cmd) + else: + profile_cmd = "nvprof --profile-from-start off {}".format(cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): line_from = None @@ -91,8 +94,8 @@ def _parse_gpu_time(self, line): class NsightRunner(object): - def run(self, cmd): - stdout, exit_code = self._nsight(cmd) + def run(self, cmd, profile_from_start=False): + stdout, exit_code = self._nsight(cmd, profile_from_start) if exit_code == 0: parse_status, gpu_time = self._parse_logs(stdout.split("\n")) if parse_status: @@ -100,9 +103,13 @@ def run(self, cmd): print("Running Error:\n {}".format(stdout)) return 0.0 - def _nsight(self, cmd): - return system.run_command( - "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd)) + def _nsight(self, cmd, profile_from_start): + if profile_from_start: + profile_cmd = "nsys nvprof -o tmp.qdrep {}".format(cmd) + else: + profile_cmd = "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format( + cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): kernel_line_from = None @@ -168,7 +175,10 @@ def _parse_gpu_time(self, line): return gpu_time / percent -def launch(benchmark_script, benchmark_script_args, with_nvprof=False): +def launch(benchmark_script, + benchmark_script_args, + with_nvprof=False, + profile_from_start=True): """ If with_nvprof is True, it will launch the following command firstly to get the gpu_time: @@ -188,7 +198,7 @@ def _set_profiler(args, value): args.append("--profiler") args.append(value) - if with_nvprof: + if with_nvprof and not profile_from_start: _set_profiler(benchmark_script_args, "nvprof") cmd = "{} {} {}".format(sys.executable, benchmark_script, " ".join(benchmark_script_args)) @@ -197,7 +207,7 @@ def _set_profiler(args, value): runner = NsightRunner() else: runner = NvprofRunner() - gpu_time = runner.run(cmd) + gpu_time = runner.run(cmd, profile_from_start) _set_profiler(benchmark_script_args, "none") return gpu_time else: @@ -234,6 +244,7 @@ def _args_list_to_dict(arg_list): args = parser.parse_args() benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args) task = benchmark_args_dict.get("task", "speed") + framework = benchmark_args_dict.get("framework", "paddle") use_gpu = system.str2bool(benchmark_args_dict.get( "use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES", None) != "" @@ -243,10 +254,12 @@ def _args_list_to_dict(arg_list): system.check_commit() if use_gpu and task == "speed" and profiler == "none": + profile_from_start = False total_gpu_time = launch( args.benchmark_script, args.benchmark_script_args, - with_nvprof=True) + with_nvprof=True, + profile_from_start=profile_from_start) args.benchmark_script_args.append(" --gpu_time ") args.benchmark_script_args.append(str(total_gpu_time)) diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py index 9e3a65f01b..eb33a22285 100644 --- a/api/common/tensorflow_api_benchmark.py +++ b/api/common/tensorflow_api_benchmark.py @@ -40,21 +40,25 @@ class Profiler(object): def __init__(self, name, sess, profiler): self.name = name - self.sess = sess self.profiler = profiler - self.profiler_handle = None + self._sess = sess + self._profiler_handle = None self.run_options = None self.run_metadata = None self.generate_timeline = False def __enter__(self): - if self.profiler == "pyprof": + if self.profiler == "nvprof": + import ctypes + self._cudart = ctypes.CDLL('libcudart.so') + self._cudart.cudaProfilerStart() + elif self.profiler == "pyprof": import cProfile - self.profiler_handle = cProfile.Profile() - self.profiler_handle.enable() - elif self.profiler != "none": - self.profiler_handle = model_analyzer.Profiler( - graph=self.sess.graph) + self._profiler_handle = cProfile.Profile() + self._profiler_handle.enable() + elif self.profiler == "native": + self._profiler_handle = model_analyzer.Profiler( + graph=self._sess.graph) if tf.__version__ < "1.15.0": self.run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) @@ -66,9 +70,9 @@ def __enter__(self): return self def add_step(self, step): - if self.profiler != "none" and self.profiler != "pyprof": + if self.profiler == "native": # Update profiler - self.profiler_handle.add_step( + self._profiler_handle.add_step( step=step, run_meta=self.run_metadata) if self.generate_timeline: # For timeline @@ -78,16 +82,18 @@ def add_step(self, step): trace_file.write(chrome_trace) def __exit__(self, exception_type, exception_value, traceback): - if self.profiler == "pyprof": + if self.profiler == "nvprof": + self._cudart.cudaProfilerStop() + elif self.profiler == "pyprof": import pstats, StringIO - self.profiler_handle.disable() + self._profiler_handle.disable() # self.profiler_handle.dump_stats("./outputs/" + self.name + ".pyprof") s = StringIO.StringIO() ps = pstats.Stats( self.profiler_handle, stream=s).sort_stats("cumulative") ps.print_stats() print(s.getvalue()) - elif self.profiler != "none": + elif self.profiler == "native": # Generate profiling result profile_op_builder = option_builder.ProfileOptionBuilder().select( ['micros', 'occurrence']).order_by('micros').with_max_depth(5) diff --git a/api/common/utils.py b/api/common/utils.py index db0b6d8cc4..a6e81eff71 100644 --- a/api/common/utils.py +++ b/api/common/utils.py @@ -176,13 +176,17 @@ def check_outputs(output_list, target = target_list[i] if testing_mode == "static": - if isinstance( - target, - tf.python.framework.indexed_slices.IndexedSlicesValue): - print( - "---- Warning: Th %d-th target's type is IndexedSlicesValue and the check is skipped. " - "It will be fixed later." % i) - continue + try: + if isinstance(target, tf.python.framework.indexed_slices. + IndexedSlicesValue): + print( + "---- Warning: Th %d-th target's type is IndexedSlicesValue and the check is skipped. " + "It will be fixed later." % i) + continue + except Exception as e: + if tf.__version__ < "2.4.0": + # I am not sure about the exact version + print("Meets an exception: {}".format(e)) output, target = _check_type(output, target) output, target = _check_shape(name, output, target, i) diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh index 42d5e993c2..a754353511 100755 --- a/api/run_op_benchmark.sh +++ b/api/run_op_benchmark.sh @@ -101,7 +101,7 @@ main() { install_package "torch" "1.10.0" else testing_mode="static" - install_package "tensorflow" "2.3.1" + install_package "tensorflow" "2.7.0" fi case ${op_type} in diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py index 3b786e42d2..ebaa7796bb 100644 --- a/api/tests_v2/batch_norm.py +++ b/api/tests_v2/batch_norm.py @@ -22,14 +22,6 @@ def __init__(self): def init_from_json(self, filename, config_id=0, unknown_dim=16): super(BatchNormConfig, self).init_from_json(filename, config_id, unknown_dim) - # tf's batch_norm does not have data_format param, it only support NHWC format. - if self.data_format == "NCHW": - print( - "Warning:\n" - " 1. tf's batch_norm does not have data_format param, it only support NHWC format.\n" - ) - self.run_tf = False - if len(self.x_shape) == 4: if self.data_format == "NCHW": self.num_channels = self.x_shape[1] @@ -40,49 +32,47 @@ def init_from_json(self, filename, config_id=0, unknown_dim=16): def to_tensorflow(self): tf_config = super(BatchNormConfig, self).to_tensorflow() - if len(tf_config.x_shape) == 4: - tf_config.axes = [0, 1, 2] + if len(self.x_shape) == 4: + tf_config.axis = 1 if self.data_format == "NCHW" else 3 else: - tf_config.axes = [0] + tf_config.axis = 1 return tf_config class PDBatchNorm(PaddleAPIBenchmarkBase): def build_program(self, config): + def _create_parameter(name, value, stop_gradient): + param = paddle.create_parameter( + name=name, + shape=[config.num_channels], + dtype=config.x_dtype, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value))) + param.stop_gradient = stop_gradient + return param + x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) - running_mean = paddle.create_parameter( - name='running_mean', - shape=[config.num_channels], - dtype=config.x_dtype, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(0.5))) - running_mean.stop_gradient = True - running_var = paddle.create_parameter( - name='running_var', - shape=[config.num_channels], - dtype=config.x_dtype, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(0.1))) - running_var.stop_gradient = True - - scale = self.variable( - name='scale', shape=[config.num_channels], dtype=config.x_dtype) - bias = self.variable( - name='bias', shape=[config.num_channels], dtype=config.x_dtype) + running_mean = _create_parameter( + name='running_mean', value=0.5, stop_gradient=True) + running_var = _create_parameter( + name='running_var', value=0.1, stop_gradient=True) + + scale = _create_parameter(name='scale', value=0.5, stop_gradient=False) + bias = _create_parameter(name='bias', value=0.1, stop_gradient=False) result = paddle.nn.functional.batch_norm( x=x, running_mean=running_mean, running_var=running_var, - weight=scale, # scale - bias=bias, # bias + weight=scale, + bias=bias, epsilon=config.epsilon, momentum=config.momentum, training=config.training, data_format=config.data_format) - self.feed_vars = [x, scale, bias] + self.feed_vars = [x] self.fetch_vars = [result] if config.backward: self.append_gradients(result, [x, scale, bias]) @@ -91,24 +81,20 @@ def build_program(self, config): class TFBatchNorm(TensorflowAPIBenchmarkBase): def build_graph(self, config): x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) - scale = self.variable( - name='scale', shape=[config.num_channels], dtype=config.x_dtype) - bias = self.variable( - name='bias', shape=[config.num_channels], dtype=config.x_dtype) - mean, var = tf.nn.moments( - x=x, axes=config.axes, shift=None, keepdims=False) - result = tf.nn.batch_normalization( - x=x, - mean=mean, - variance=var, - offset=bias, - scale=scale, - variance_epsilon=config.epsilon) + bn = tf.keras.layers.BatchNormalization( + axis=config.axis, + momentum=config.momentum, + epsilon=config.epsilon, + beta_initializer=tf.constant_initializer(0.1), + gamma_initializer=tf.constant_initializer(0.5), + moving_mean_initializer=tf.constant_initializer(0.5), + moving_variance_initializer=tf.constant_initializer(0.1)) + result = bn(x, training=config.training) - self.feed_list = [x, scale, bias] + self.feed_list = [x] self.fetch_list = [result] if config.backward: - self.append_gradients(result, [x, scale, bias]) + self.append_gradients(result, [x, bn.gamma, bn.beta]) if __name__ == '__main__': diff --git a/api/tests_v2/configs/batch_norm.json b/api/tests_v2/configs/batch_norm.json index d54219ae78..10e8e01218 100644 --- a/api/tests_v2/configs/batch_norm.json +++ b/api/tests_v2/configs/batch_norm.json @@ -23,7 +23,8 @@ "type": "float", "value": "0.9" } - } + }, + "atol": 1E-5 }, { "config_id": 1, "op": "batch_norm", @@ -49,7 +50,8 @@ "type": "float", "value": "0.9" } - } + }, + "atol": 1E-4 }, { "config_id": 2, "op": "batch_norm", From 6702a92b7be821acd925c14ef5f9fab65c7edadd Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 4 Jan 2022 14:09:31 +0000 Subject: [PATCH 02/12] Remove the support of tf under 1.15.0. --- api/common/tensorflow_api_benchmark.py | 72 +++++++++++--------------- api/tests_v2/while_loop.py | 22 +++----- 2 files changed, 37 insertions(+), 57 deletions(-) diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py index eb33a22285..de22b091a4 100644 --- a/api/common/tensorflow_api_benchmark.py +++ b/api/common/tensorflow_api_benchmark.py @@ -28,6 +28,7 @@ try: import tensorflow as tf + from tensorflow.python.profiler import model_analyzer from tensorflow.python.profiler import option_builder from tensorflow.core.protobuf import config_pb2 @@ -59,14 +60,9 @@ def __enter__(self): elif self.profiler == "native": self._profiler_handle = model_analyzer.Profiler( graph=self._sess.graph) - if tf.__version__ < "1.15.0": - self.run_options = tf.RunOptions( - trace_level=tf.RunOptions.FULL_TRACE) - self.run_metadata = tf.RunMetadata() - else: - self.run_options = tf.compat.v1.RunOptions( - trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - self.run_metadata = tf.compat.v1.RunMetadata() + self.run_options = tf.compat.v1.RunOptions( + trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + self.run_metadata = tf.compat.v1.RunMetadata() return self def add_step(self, step): @@ -90,14 +86,15 @@ def __exit__(self, exception_type, exception_value, traceback): # self.profiler_handle.dump_stats("./outputs/" + self.name + ".pyprof") s = StringIO.StringIO() ps = pstats.Stats( - self.profiler_handle, stream=s).sort_stats("cumulative") + self._profiler_handle, stream=s).sort_stats("cumulative") ps.print_stats() print(s.getvalue()) elif self.profiler == "native": # Generate profiling result profile_op_builder = option_builder.ProfileOptionBuilder().select( ['micros', 'occurrence']).order_by('micros').with_max_depth(5) - self.profiler_handle.profile_operations(profile_op_builder.build()) + self._profiler_handle.profile_operations(profile_op_builder.build( + )) return self @@ -167,8 +164,9 @@ def __init__(self): try: import tensorflow as tf self.graph = tf.Graph() - if tf.__version__ > "1.15.0": - tf.compat.v1.disable_eager_execution() + assert tf.__version__ >= "1.15.0", "The installed tensorflow's version is expected to be newer than 1.15.0, but recieved {}".format( + tf.__version__) + tf.compat.v1.disable_eager_execution() except Exception as e: sys.stderr.write( "Cannot import tensorflow, maybe tensorflow is not installed.\n" @@ -180,11 +178,7 @@ def build_graph(self, config=None): def placeholder(self, name, shape, dtype): tf_dtype = tf.as_dtype(dtype) - if tf.__version__ >= "1.15.0": - var = tf.compat.v1.placeholder( - name=name, shape=shape, dtype=tf_dtype) - else: - var = tf.placeholder(name=name, shape=shape, dtype=tf_dtype) + var = tf.compat.v1.placeholder(name=name, shape=shape, dtype=tf_dtype) return var def variable(self, name, shape, dtype, value=None): @@ -272,8 +266,6 @@ def _run_null_graph(self, use_gpu, repeat): def run_impl(self, use_gpu, feed, repeat=1, profiler="none"): sess = self._init_session(use_gpu) - #tf.debugging.set_log_device_placement(True) - def _run_main_iter(run_options=None, run_metadata=None): feed_dict = feed if self._need_feed else None if self._need_fetch: @@ -378,31 +370,27 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None): self.fetch_list = fetch_list self.allow_growth = False if args.task == "speed" else True - outputs, stats = self.run_impl( - use_gpu=args.use_gpu, - feed=feed, - repeat=args.repeat, - profiler=args.profiler) + device = "GPU:0" if args.use_gpu else "CPU" + with tf.device(device): + outputs, stats = self.run_impl( + use_gpu=args.use_gpu, + feed=feed, + repeat=args.repeat, + profiler=args.profiler) return outputs, stats def _init_session(self, use_gpu): - if tf.__version__ >= "1.15.0": - config = tf.compat.v1.ConfigProto() - if use_gpu: - config.gpu_options.allow_growth = self.allow_growth - else: - # In default, TF use full cpu cores, but Paddle use one cpu core. - # To make the same experiment, set TF use one cpu core as well. - # See https://github.com/PaddlePaddle/Paddle/issues/18665#issuecomment-513780210 - config.intra_op_parallelism_threads = 1 - config.inter_op_parallelism_threads = 1 - sess = tf.compat.v1.Session(config=config) - sess.run(tf.compat.v1.global_variables_initializer()) - sess.run(tf.compat.v1.local_variables_initializer()) - else: - config = tf.ConfigProto() + config = tf.compat.v1.ConfigProto() + if use_gpu: config.gpu_options.allow_growth = self.allow_growth - sess = tf.Session(config=config) - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) + config.graph_options.optimizer_options.global_jit_level = 2 + else: + # In default, TF use full cpu cores, but Paddle use one cpu core. + # To make the same experiment, set TF use one cpu core as well. + # See https://github.com/PaddlePaddle/Paddle/issues/18665#issuecomment-513780210 + config.intra_op_parallelism_threads = 1 + config.inter_op_parallelism_threads = 1 + sess = tf.compat.v1.Session(config=config) + sess.run(tf.compat.v1.global_variables_initializer()) + sess.run(tf.compat.v1.local_variables_initializer()) return sess diff --git a/api/tests_v2/while_loop.py b/api/tests_v2/while_loop.py index 95d471d828..bba3b22d0f 100644 --- a/api/tests_v2/while_loop.py +++ b/api/tests_v2/while_loop.py @@ -73,21 +73,13 @@ def cond(i, loop_len, input, result): return i < loop_len def body(i, loop_len, input, result): - if tf.__version__ <= "1.15.0": - result = tf.contrib.layers.fully_connected( - inputs=input, - num_outputs=config.size, - weights_initializer=tf.constant_initializer(0.5), - biases_initializer=tf.constant_initializer(0.1), - activation_fn=None) - else: - result = tf.compat.v1.layers.dense( - inputs=input, - units=config.size, - activation=None, - use_bias=True, - kernel_initializer=tf.constant_initializer(0.5), - bias_initializer=tf.constant_initializer(0.1)) + result = tf.compat.v1.layers.dense( + inputs=input, + units=config.size, + activation=None, + use_bias=True, + kernel_initializer=tf.constant_initializer(0.5), + bias_initializer=tf.constant_initializer(0.1)) return [i + 1, loop_len, input, result] input = self.variable( From f4ed56fb2279637553a2973d73cc692f6d306baf Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 4 Jan 2022 14:18:25 +0000 Subject: [PATCH 03/12] Allow the control the benchmark through environ. --- api/common/env.py | 23 +++++++++++++++++++++++ api/common/tensorflow_api_benchmark.py | 7 +++++-- 2 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 api/common/env.py diff --git a/api/common/env.py b/api/common/env.py new file mode 100644 index 0000000000..00d82a33ba --- /dev/null +++ b/api/common/env.py @@ -0,0 +1,23 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +def benchmark_need_feed(): + return os.environ.get("BENCHMARK_NEED_FEED", False) + + +def benchmark_need_fetch(): + return os.environ.get("BENCHMARK_NEED_FETCH", False) diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py index de22b091a4..5ac07d5d97 100644 --- a/api/common/tensorflow_api_benchmark.py +++ b/api/common/tensorflow_api_benchmark.py @@ -22,6 +22,7 @@ import numpy as np from common import special_op_list +from . import env from . import utils from . import api_param from . import feeder @@ -323,8 +324,10 @@ def generate_random_feeder(self, assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle." if feeder_adapter is None or feeder_adapter.framework != "tensorflow": - self._need_feed = config.name == "feed" - self._need_fetch = use_feed_fetch or config.name == "fetch" + self._need_feed = env.benchmark_need_feed( + ) or config.name == "feed" + self._need_fetch = env.benchmark_need_fetch( + ) or use_feed_fetch or config.name == "fetch" self._feed_spec = feeder.copy_feed_spec(config.feed_spec) self._feed_dict = {} From 6ec9b0f6b2680dd2ccf9b628dd95218d3043e79d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 5 Jan 2022 03:11:46 +0000 Subject: [PATCH 04/12] Remove the set of optimizer options and fix showing bugs when writing the summary results to excel. --- api/common/tensorflow_api_benchmark.py | 1 - api/deploy/write_excel.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/api/common/tensorflow_api_benchmark.py b/api/common/tensorflow_api_benchmark.py index 5ac07d5d97..3978ce769f 100644 --- a/api/common/tensorflow_api_benchmark.py +++ b/api/common/tensorflow_api_benchmark.py @@ -386,7 +386,6 @@ def _init_session(self, use_gpu): config = tf.compat.v1.ConfigProto() if use_gpu: config.gpu_options.allow_growth = self.allow_growth - config.graph_options.optimizer_options.global_jit_level = 2 else: # In default, TF use full cpu cores, but Paddle use one cpu core. # To make the same experiment, set TF use one cpu core as well. diff --git a/api/deploy/write_excel.py b/api/deploy/write_excel.py index 3bcb9f1a3c..18e4726c80 100644 --- a/api/deploy/write_excel.py +++ b/api/deploy/write_excel.py @@ -191,12 +191,12 @@ def _write_title_and_set_column_width(worksheet, device, compare_framework, column_width.append(16) column_width.append(16) column_width.append(16) - if device == "gpu" and direction == "forward": + if device == "gpu" and direction in ["forward", "backward"]: title_names.append("paddle(gflops)") title_names.append("paddle(gbs)") title_names.append("accuracy") title_names.append("parameters") - if device == "gpu" and direction == "forward": + if device == "gpu" and direction in ["forward", "backward"]: column_width.append(16) column_width.append(16) column_width.append(10) From fac27de6fd71bcb42ae7a73244056fc711230d0d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 5 Jan 2022 14:06:37 +0000 Subject: [PATCH 05/12] Remove argsort from no_backward_ops list. --- api/common/special_op_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/common/special_op_list.py b/api/common/special_op_list.py index cf4332a4f8..bd4409a759 100644 --- a/api/common/special_op_list.py +++ b/api/common/special_op_list.py @@ -36,7 +36,6 @@ "arange", "argmax", "argmin", - "argsort", "assign", "cast", "clip_by_norm", From aac10741eb93ccfd1ca4034438991f9f1d104e22 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 6 Jan 2022 03:12:36 +0000 Subject: [PATCH 06/12] Change the approveal github ids. --- ci/scripts/check_approval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/check_approval.sh b/ci/scripts/check_approval.sh index a870d6ccbf..9fab441968 100644 --- a/ci/scripts/check_approval.sh +++ b/ci/scripts/check_approval.sh @@ -25,7 +25,7 @@ BENCHMARK_ROOT=$(cd $(dirname $0)/../../ && pwd) declare -A FILE_APPROVAL_USER_MAP FILE_APPROVAL_USER_MAP=( - ["api/common/special_op_list.py"]="GaoWei8 wangchaochaohu zhangting2020" + ["api/common/special_op_list.py"]="JamesLim-sy ZzSean zhangting2020" ) LOG "[INFO] Get approval list ..." From cf162b2406b703d7d7f7a4a5c974f77de668ecb6 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 7 Jan 2022 10:03:38 +0000 Subject: [PATCH 07/12] Exclude the DtoH time when needs fetch. --- api/common/launch.py | 100 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/api/common/launch.py b/api/common/launch.py index 463a9710cd..d3e9da005f 100644 --- a/api/common/launch.py +++ b/api/common/launch.py @@ -18,6 +18,7 @@ import sys import argparse +from common import env from common import system from common import api_param @@ -33,6 +34,60 @@ def is_ampere_gpu(): return False +class TimeUnit(object): + def __init__(self): + self.kernel_time = 0.0 + self.memory_time = 0.0 + self.memcpy_h2d = 0.0 + self.memcpy_d2h = 0.0 + self.memcpy_d2d = 0.0 + self.memset = 0.0 + + def total(self): + self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset + if not env.benchmark_need_fetch(): + # Normally DtoH is fetching results. + self.memory_time += self.memcpy_d2h + return self.kernel_time + self.memory_time + + def __str__(self): + total_time = self.total() + if env.benchmark_need_fetch(): + infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format( + total_time) + else: + infostr = "total gpu_time: {:.4f} ms ".format(total_time) + if total_time > 0.0: + infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format( + self.kernel_time, self.kernel_time * 100 / total_time, + self.memory_time, self.memory_time * 100 / total_time) + else: + infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format( + self.kernel_time, self.memory_time) + infostr += "\n" + return infostr + + def add_info(self, time, name): + if name == "[CUDA memcpy HtoD]": + self._update_memory_time("memcpy_h2d", time) + elif name == "[CUDA memcpy DtoH]": + self._update_memory_time("memcpy_d2h", time) + elif name == "[CUDA memcpy DtoD]": + self._update_memory_time("memcpy_d2d", time) + elif name == "[CUDA memset]": + self._update_memory_time("memset", time) + else: + self.kernel_time += time + + def _update_memory_time(self, member_name, time): + assert member_name in [ + "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset" + ] + setattr(self, member_name, time) + if member_name != "memcpy_d2h" or not env.benchmark_need_fetch(): + self.memory_time += time + + class NvprofRunner(object): def run(self, cmd, profile_from_start=False): stdout, exit_code = self._nvprof(cmd, profile_from_start) @@ -61,36 +116,39 @@ def _parse_logs(self, logs): line_to = i break if line_from is not None and line_to is not None: + time_unit = TimeUnit() for i in range(line_from, line_to): print(logs[i]) + if i >= line_from + 1: + begin_pos = 2 if i == line_from + 1 else 0 + gpu_time, percent, function = self._parse_line(logs[i], + begin_pos) + time_unit.add_info(gpu_time, function) print("") - return True, self._parse_gpu_time(logs[line_from + 1]) + print(time_unit) + return True, time_unit.total() else: return False, 0.0 - def _parse_gpu_time(self, line): - infos = line.strip().split() - percent = float(infos[2].replace("%", "")) * 0.01 - gpu_time = infos[3] - if gpu_time.endswith("us"): - gpu_time = float(gpu_time.replace("us", "")) * 0.001 - elif gpu_time.endswith("ms"): - gpu_time = float(gpu_time.replace("ms", "")) - elif gpu_time.endswith("s"): - gpu_time = float(gpu_time.replace("s", "")) * 1000 + def _to_millisecond(self, timestr): + if timestr.endswith("us"): + return float(timestr.replace("us", "")) * 0.001 + elif timestr.endswith("ms"): + return float(timestr.replace("ms", "")) + elif timestr.endswith("s"): + return float(timestr.replace("s", "")) * 1000 else: raise ValueError("Invalid time: %s" % gpu_time) - calls = int(infos[4]) - function = infos[8] - for i in range(9, len(infos)): - function = function + " " + infos[i] - #print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" % - # (percent, gpu_time, calls, function)) - total_gpu_time = gpu_time / percent - print("total gpu_time: %.4f ms" % total_gpu_time) - print("") - return total_gpu_time + def _parse_line(self, line, begin_pos=0): + infos = line.strip().split() + percent = float(infos[begin_pos].replace("%", "")) * 0.01 + gpu_time = self._to_millisecond(infos[begin_pos + 1]) + calls = int(infos[begin_pos + 2]) + function = infos[begin_pos + 6] + for i in range(begin_pos + 7, len(infos)): + function = function + " " + infos[i] + return gpu_time, percent, function class NsightRunner(object): From 685bf7077a739882568d49a32b2086890b592ae8 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 18 Jan 2022 02:51:32 +0000 Subject: [PATCH 08/12] Add fused_batch_norm_relu scripts. --- api/run_op_benchmark.sh | 19 +++--- api/tests_v2/batch_norm.py | 4 +- api/tests_v2/fused_batch_norm_relu.py | 89 +++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 11 deletions(-) create mode 100644 api/tests_v2/fused_batch_norm_relu.py diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh index 6208c01a2f..a7fd20948f 100755 --- a/api/run_op_benchmark.sh +++ b/api/run_op_benchmark.sh @@ -4,7 +4,8 @@ OP_BENCHMARK_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")" && pwd )" test_module_name=${1:-"dynamic_tests_v2"} # "tests_v2", "dynamic_tests_v2" gpu_ids=${2:-"0"} -op_type=${3:-"all"} # "all" or specified op_type, such as elementwise +model_name_op_type=${3:-"all"} # "all" or specified model_name/op_type, such as elementwise +config_subdir=${4:-"op_configs"} if [ ${test_module_name} != "tests_v2" ] && [ ${test_module_name} != "dynamic_tests_v2" ]; then echo "Please set test_module_name (${test_module_name}) to \"tests_v2\" or \"dynamic_tests_v2\"!" @@ -63,10 +64,10 @@ run_op_benchmark() { bash ${OP_BENCHMARK_ROOT}/deploy/main_control.sh ${tests_dir} ${config_dir} ${output_dir} ${gpu_ids} "both" "both" "none" "both" "${testing_mode}" > ${log_path} 2>&1 & } -run_specified_op() { +run_specified_task() { local testing_mode=$1 - OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${op_type} + OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${model_name_op_type} if [ ! -d ${OUTPUT_ROOT} ]; then mkdir -p ${OUTPUT_ROOT} fi @@ -79,11 +80,11 @@ run_specified_op() { echo "-- output_dir: ${output_dir}" if [ "${test_module_name}" == "tests" ]; then - config_dir=${OP_BENCHMARK_ROOT}/tests/op_configs - op_list=${OUTPUT_ROOT}/api_info_${op_type}.txt + config_dir=${OP_BENCHMARK_ROOT}/tests/${config_subdir} + op_list=${OUTPUT_ROOT}/api_info_${model_name_op_type}.txt else - config_dir=${OP_BENCHMARK_ROOT}/tests_v2/op_configs - op_list=${OUTPUT_ROOT}/api_info_v2_${op_type}.txt + config_dir=${OP_BENCHMARK_ROOT}/tests_v2/${config_subdir} + op_list=${OUTPUT_ROOT}/api_info_v2_${model_name_op_type}.txt fi echo "-- config_dir: ${config_dir}" @@ -104,12 +105,12 @@ main() { install_package "tensorflow" "2.7.0" fi - case ${op_type} in + case ${model_name_op_type} in all) run_op_benchmark ${testing_mode} ;; *) - run_specified_op ${testing_mode} + run_specified_task ${testing_mode} ;; esac } diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py index ebaa7796bb..43b250a69a 100644 --- a/api/tests_v2/batch_norm.py +++ b/api/tests_v2/batch_norm.py @@ -16,8 +16,8 @@ class BatchNormConfig(APIConfig): - def __init__(self): - super(BatchNormConfig, self).__init__('batch_norm') + def __init__(self, op_type="batch_norm"): + super(BatchNormConfig, self).__init__(op_type) def init_from_json(self, filename, config_id=0, unknown_dim=16): super(BatchNormConfig, self).init_from_json(filename, config_id, diff --git a/api/tests_v2/fused_batch_norm_relu.py b/api/tests_v2/fused_batch_norm_relu.py new file mode 100644 index 0000000000..326b972caf --- /dev/null +++ b/api/tests_v2/fused_batch_norm_relu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from common_import import * +from batch_norm import BatchNormConfig + + +class FusedBatchNormReluConfig(BatchNormConfig): + def __init__(self): + super(FusedBatchNormReluConfig, self).__init__("fused_batch_norm_relu") + self.alias_name = "batch_norm" + + +class PDFusedBatchNormRelu(PaddleAPIBenchmarkBase): + def build_program(self, config): + def _create_parameter(name, value, stop_gradient): + param = paddle.create_parameter( + name=name, + shape=[config.num_channels], + dtype=config.x_dtype, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value))) + param.stop_gradient = stop_gradient + return param + + x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) + + running_mean = _create_parameter( + name='running_mean', value=0.5, stop_gradient=True) + running_var = _create_parameter( + name='running_var', value=0.1, stop_gradient=True) + + scale = _create_parameter(name='scale', value=0.5, stop_gradient=False) + bias = _create_parameter(name='bias', value=0.1, stop_gradient=False) + + bn_out = paddle.nn.functional.batch_norm( + x=x, + running_mean=running_mean, + running_var=running_var, + weight=scale, + bias=bias, + epsilon=config.epsilon, + momentum=config.momentum, + training=config.training, + data_format=config.data_format) + relu_out = paddle.nn.functional.relu(bn_out) + + self.feed_vars = [x] + self.fetch_vars = [bn_out, relu_out] + if config.backward: + self.append_gradients(relu_out, [x, scale, bias, bn_out]) + + +class TFFusedBatchNormRelu(TensorflowAPIBenchmarkBase): + def build_graph(self, config): + x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) + bn = tf.keras.layers.BatchNormalization( + axis=config.axis, + momentum=config.momentum, + epsilon=config.epsilon, + beta_initializer=tf.constant_initializer(0.1), + gamma_initializer=tf.constant_initializer(0.5), + moving_mean_initializer=tf.constant_initializer(0.5), + moving_variance_initializer=tf.constant_initializer(0.1)) + bn_out = bn(x, training=config.training) + relu_out = tf.nn.relu(bn_out) + + self.feed_list = [x] + self.fetch_list = [bn_out, relu_out] + if config.backward: + self.append_gradients(relu_out, [x, bn.gamma, bn.beta, bn_out]) + + +if __name__ == '__main__': + test_main( + PDFusedBatchNormRelu(), + TFFusedBatchNormRelu(), + config=FusedBatchNormReluConfig()) From 4438014a5775b0f222e9168617faec10f8e3e671 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 19 Jan 2022 06:49:32 +0000 Subject: [PATCH 09/12] Does not test the fused_xxx ops in ci. --- api/deploy/collect_api_info.py | 10 ++++++++-- ci/scripts/run_test.sh | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py index ee13bcd606..fa935d2d8f 100644 --- a/api/deploy/collect_api_info.py +++ b/api/deploy/collect_api_info.py @@ -50,8 +50,14 @@ def collect_subclass_dict(test_cases_dict): def import_all_tests(test_module_name): + def _is_special_module(api_name): + special_module_list = ["__init__", "common_import", "fused_"] + for name in special_module_list: + if name in api_name: + return True + return False + test_cases_dict = {} - special_module_list = ["__init__", "common_import"] def _import_api(test_module_name, basename): try: @@ -66,7 +72,7 @@ def _import_api(test_module_name, basename): for filename in sorted(os.listdir(tests_path)): api_name = os.path.splitext(filename)[0] file_extension = os.path.splitext(filename)[1] - if file_extension == '.py' and api_name not in special_module_list: + if file_extension == '.py' and _is_special_module(api_name): module = _import_api(test_module_name, api_name) if module: test_cases_dict[api_name] = module diff --git a/ci/scripts/run_test.sh b/ci/scripts/run_test.sh index 8975ecb98b..3c0f288b04 100644 --- a/ci/scripts/run_test.sh +++ b/ci/scripts/run_test.sh @@ -74,6 +74,7 @@ function run_api(){ LOG "[INFO] Found ${file} modified." api=${file#*api/} && api=${api%.*} [[ "$api" =~ "common_import" ]] && continue + [[ "$api" =~ "fused_" ]] && continue [ -f "${BENCHMARK_ROOT}/api/${api}.py" ] && API_NAMES[${#API_NAMES[@]}]=$api if [[ "$file" =~ ".json" ]] then From f477069b6ebbf494e43bd933c480b575aa4a8368 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 21 Jan 2022 06:32:40 +0000 Subject: [PATCH 10/12] Add fused_batch_norm_add_relu and fix a bug. --- api/deploy/collect_api_info.py | 4 +- api/tests_v2/fused_batch_norm_add_relu.py | 95 +++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 api/tests_v2/fused_batch_norm_add_relu.py diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py index fa935d2d8f..b0b91163df 100644 --- a/api/deploy/collect_api_info.py +++ b/api/deploy/collect_api_info.py @@ -72,7 +72,7 @@ def _import_api(test_module_name, basename): for filename in sorted(os.listdir(tests_path)): api_name = os.path.splitext(filename)[0] file_extension = os.path.splitext(filename)[1] - if file_extension == '.py' and _is_special_module(api_name): + if file_extension == '.py' and not _is_special_module(api_name): module = _import_api(test_module_name, api_name) if module: test_cases_dict[api_name] = module @@ -138,7 +138,7 @@ def main(args): parser.add_argument( '--test_module_name', type=str, - default="tests", + default="tests_v2", help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).' ) parser.add_argument( diff --git a/api/tests_v2/fused_batch_norm_add_relu.py b/api/tests_v2/fused_batch_norm_add_relu.py new file mode 100644 index 0000000000..7ba239c613 --- /dev/null +++ b/api/tests_v2/fused_batch_norm_add_relu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from common_import import * +from batch_norm import BatchNormConfig + + +class FusedBatchNormAddReluConfig(BatchNormConfig): + def __init__(self): + super(FusedBatchNormAddReluConfig, + self).__init__("fused_batch_norm_add_relu") + self.alias_name = "batch_norm" + + +class PDFusedBatchNormAddRelu(PaddleAPIBenchmarkBase): + def build_program(self, config): + def _create_parameter(name, value, stop_gradient): + param = paddle.create_parameter( + name=name, + shape=[config.num_channels], + dtype=config.x_dtype, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value))) + param.stop_gradient = stop_gradient + return param + + x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) + y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype) + + running_mean = _create_parameter( + name='running_mean', value=0.5, stop_gradient=True) + running_var = _create_parameter( + name='running_var', value=0.1, stop_gradient=True) + + scale = _create_parameter(name='scale', value=0.5, stop_gradient=False) + bias = _create_parameter(name='bias', value=0.1, stop_gradient=False) + + bn_out = paddle.nn.functional.batch_norm( + x=x, + running_mean=running_mean, + running_var=running_var, + weight=scale, + bias=bias, + epsilon=config.epsilon, + momentum=config.momentum, + training=config.training, + data_format=config.data_format) + add_out = bn_out + y + relu_out = paddle.nn.functional.relu(add_out) + + self.feed_vars = [x, y] + self.fetch_vars = [bn_out, add_out, relu_out] + if config.backward: + self.append_gradients(relu_out, [x, scale, bias, bn_out, add_out]) + + +class TFFusedBatchNormAddRelu(TensorflowAPIBenchmarkBase): + def build_graph(self, config): + x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype) + y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype) + bn = tf.keras.layers.BatchNormalization( + axis=config.axis, + momentum=config.momentum, + epsilon=config.epsilon, + beta_initializer=tf.constant_initializer(0.1), + gamma_initializer=tf.constant_initializer(0.5), + moving_mean_initializer=tf.constant_initializer(0.5), + moving_variance_initializer=tf.constant_initializer(0.1)) + bn_out = bn(x, training=config.training) + add_out = bn_out + y + relu_out = tf.nn.relu(add_out) + + self.feed_list = [x, y] + self.fetch_list = [bn_out, add_out, relu_out] + if config.backward: + self.append_gradients(relu_out, + [x, bn.gamma, bn.beta, bn_out, add_out]) + + +if __name__ == '__main__': + test_main( + PDFusedBatchNormAddRelu(), + TFFusedBatchNormAddRelu(), + config=FusedBatchNormAddReluConfig()) From ec29e7262331c8e3b06cc07aa607af24b146d6d7 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 21 Feb 2022 02:37:48 +0000 Subject: [PATCH 11/12] Update tf'version to 2.8.0. --- api/run_op_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh index a7fd20948f..156a92a65f 100755 --- a/api/run_op_benchmark.sh +++ b/api/run_op_benchmark.sh @@ -102,7 +102,7 @@ main() { install_package "torch" "1.10.0" else testing_mode="static" - install_package "tensorflow" "2.7.0" + install_package "tensorflow" "2.8.0" fi case ${model_name_op_type} in From 6678cfeef560d8b0b34d285059a2d3d3722131e5 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 2 Mar 2022 09:09:18 +0000 Subject: [PATCH 12/12] Change copyright. --- api/common/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/common/env.py b/api/common/env.py index 00d82a33ba..ec84f19785 100644 --- a/api/common/env.py +++ b/api/common/env.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.