diff --git a/api/common/env.py b/api/common/env.py
new file mode 100644
index 0000000000..ec84f19785
--- /dev/null
+++ b/api/common/env.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def benchmark_need_feed():
+    return os.environ.get("BENCHMARK_NEED_FEED", False)
+
+
+def benchmark_need_fetch():
+    return os.environ.get("BENCHMARK_NEED_FETCH", False)
diff --git a/api/common/launch.py b/api/common/launch.py
index 284fc6939f..eb89e25a3f 100644
--- a/api/common/launch.py
+++ b/api/common/launch.py
@@ -18,6 +18,7 @@
 import sys
 import argparse
 
+from common import env
 from common import system
 from common import api_param
 
@@ -33,9 +34,63 @@ def is_ampere_gpu():
     return False
 
 
+class TimeUnit(object):
+    def __init__(self):
+        self.kernel_time = 0.0
+        self.memory_time = 0.0
+        self.memcpy_h2d = 0.0
+        self.memcpy_d2h = 0.0
+        self.memcpy_d2d = 0.0
+        self.memset = 0.0
+
+    def total(self):
+        self.memory_time = self.memcpy_h2d + self.memcpy_d2d + self.memset
+        if not env.benchmark_need_fetch():
+            # Normally DtoH is fetching results.
+            self.memory_time += self.memcpy_d2h
+        return self.kernel_time + self.memory_time
+
+    def __str__(self):
+        total_time = self.total()
+        if env.benchmark_need_fetch():
+            infostr = "total gpu_time (exclude DtoH): {:.4f} ms ".format(
+                total_time)
+        else:
+            infostr = "total gpu_time: {:.4f} ms ".format(total_time)
+        if total_time > 0.0:
+            infostr += "(kernel: {:.4f} ms ({:.2f}%); memory: {:.4f} ms ({:.2f}%))".format(
+                self.kernel_time, self.kernel_time * 100 / total_time,
+                self.memory_time, self.memory_time * 100 / total_time)
+        else:
+            infostr += "(kernel: {:.4f} ms; memory: {:.4f} ms)".format(
+                self.kernel_time, self.memory_time)
+        infostr += "\n"
+        return infostr
+
+    def add_info(self, time, name):
+        if name == "[CUDA memcpy HtoD]":
+            self._update_memory_time("memcpy_h2d", time)
+        elif name == "[CUDA memcpy DtoH]":
+            self._update_memory_time("memcpy_d2h", time)
+        elif name == "[CUDA memcpy DtoD]":
+            self._update_memory_time("memcpy_d2d", time)
+        elif name == "[CUDA memset]":
+            self._update_memory_time("memset", time)
+        else:
+            self.kernel_time += time
+
+    def _update_memory_time(self, member_name, time):
+        assert member_name in [
+            "memcpy_h2d", "memcpy_d2h", "memcpy_d2d", "memset"
+        ]
+        setattr(self, member_name, time)
+        if member_name != "memcpy_d2h" or not env.benchmark_need_fetch():
+            self.memory_time += time
+
+
 class NvprofRunner(object):
-    def run(self, cmd):
-        stdout, exit_code = self._nvprof(cmd)
+    def run(self, cmd, profile_from_start=False):
+        stdout, exit_code = self._nvprof(cmd, profile_from_start)
         if exit_code == 0:
             parse_status, gpu_time = self._parse_logs(stdout.split("\n"))
             if parse_status:
@@ -43,9 +98,12 @@ def run(self, cmd):
         print("Running Error:\n {}".format(stdout))
         return 0.0
 
-    def _nvprof(self, cmd):
-        return system.run_command("nvprof --profile-from-start off {}".format(
-            cmd))
+    def _nvprof(self, cmd, profile_from_start):
+        if profile_from_start:
profile_cmd = "nvprof {}".format(cmd) + else: + profile_cmd = "nvprof --profile-from-start off {}".format(cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): line_from = None @@ -58,41 +116,44 @@ def _parse_logs(self, logs): line_to = i break if line_from is not None and line_to is not None: + time_unit = TimeUnit() for i in range(line_from, line_to): print(logs[i]) + if i >= line_from + 1: + begin_pos = 2 if i == line_from + 1 else 0 + gpu_time, percent, function = self._parse_line(logs[i], + begin_pos) + time_unit.add_info(gpu_time, function) print("") - return True, self._parse_gpu_time(logs[line_from + 1]) + print(time_unit) + return True, time_unit.total() else: return False, 0.0 - def _parse_gpu_time(self, line): - infos = line.strip().split() - percent = float(infos[2].replace("%", "")) * 0.01 - gpu_time = infos[3] - if gpu_time.endswith("us"): - gpu_time = float(gpu_time.replace("us", "")) * 0.001 - elif gpu_time.endswith("ms"): - gpu_time = float(gpu_time.replace("ms", "")) - elif gpu_time.endswith("s"): - gpu_time = float(gpu_time.replace("s", "")) * 1000 + def _to_millisecond(self, timestr): + if timestr.endswith("us"): + return float(timestr.replace("us", "")) * 0.001 + elif timestr.endswith("ms"): + return float(timestr.replace("ms", "")) + elif timestr.endswith("s"): + return float(timestr.replace("s", "")) * 1000 else: raise ValueError("Invalid time: %s" % gpu_time) - calls = int(infos[4]) - function = infos[8] - for i in range(9, len(infos)): - function = function + " " + infos[i] - #print("percent: %.2f; gpu_time: %.4f ms; calls: %d; function: %s" % - # (percent, gpu_time, calls, function)) - total_gpu_time = gpu_time / percent - print("total gpu_time: %.4f ms" % total_gpu_time) - print("") - return total_gpu_time + def _parse_line(self, line, begin_pos=0): + infos = line.strip().split() + percent = float(infos[begin_pos].replace("%", "")) * 0.01 + gpu_time = self._to_millisecond(infos[begin_pos + 1]) + calls = int(infos[begin_pos + 2]) + function = infos[begin_pos + 6] + for i in range(begin_pos + 7, len(infos)): + function = function + " " + infos[i] + return gpu_time, percent, function class NsightRunner(object): - def run(self, cmd): - stdout, exit_code = self._nsight(cmd) + def run(self, cmd, profile_from_start=False): + stdout, exit_code = self._nsight(cmd, profile_from_start) if exit_code == 0: parse_status, gpu_time = self._parse_logs(stdout.split("\n")) if parse_status: @@ -100,9 +161,13 @@ def run(self, cmd): print("Running Error:\n {}".format(stdout)) return 0.0 - def _nsight(self, cmd): - return system.run_command( - "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format(cmd)) + def _nsight(self, cmd, profile_from_start): + if profile_from_start: + profile_cmd = "nsys nvprof -o tmp.qdrep {}".format(cmd) + else: + profile_cmd = "nsys nvprof --profile-from-start=off -o tmp.qdrep {}".format( + cmd) + return system.run_command(profile_cmd) def _parse_logs(self, logs): kernel_line_from = None @@ -362,7 +427,8 @@ def launch(benchmark_script, task="speed", repeat=1, sync_interval=80, - with_nvprof=False): + with_nvprof=False, + profile_from_start=True): """ If with_nvprof is True, it will launch the following command firstly to get the gpu_time: @@ -371,7 +437,8 @@ def launch(benchmark_script, Then the normal testing command will be launched: python benchmark_script benchmark_script_args """ - if with_nvprof: + + if with_nvprof and not profile_from_start: if task == "speed": _set_args(benchmark_script_args, "--profiler", "nvprof") elif 
task == "scheduling": @@ -384,7 +451,7 @@ def launch(benchmark_script, runner = NsightRunner() else: runner = NvprofRunner() - gpu_time = runner.run(cmd) + gpu_time = runner.run(cmd, profile_from_start) _set_args(benchmark_script_args, "--profiler", "none") return gpu_time elif task == "scheduling": @@ -438,6 +505,7 @@ def _set_args(args, arg, value): args = parser.parse_args() benchmark_args_dict = _args_list_to_dict(args.benchmark_script_args) task = benchmark_args_dict.get("task", "speed") + framework = benchmark_args_dict.get("framework", "paddle") use_gpu = system.str2bool(benchmark_args_dict.get( "use_gpu", "False")) and os.environ.get("CUDA_VISIBLE_DEVICES", None) != "" @@ -448,13 +516,15 @@ def _set_args(args, arg, value): system.check_commit() if use_gpu and task in ["speed", "scheduling"] and profiler == "none": + profile_from_start = False output_time = launch( args.benchmark_script, args.benchmark_script_args, task, repeat, sync_interval, - with_nvprof=True) + with_nvprof=True, + profile_from_start=profile_from_start) if task == "speed": args.benchmark_script_args.append(" --gpu_time ") args.benchmark_script_args.append(str(output_time)) diff --git a/api/common/special_op_list.py b/api/common/special_op_list.py index 32753ab1e1..4bdf713bb0 100644 --- a/api/common/special_op_list.py +++ b/api/common/special_op_list.py @@ -36,7 +36,6 @@ "arange", "argmax", "argmin", - "argsort", "assign", "cast", "clip_by_norm", diff --git a/api/common/tensorflow_op_benchmark.py b/api/common/tensorflow_op_benchmark.py index b50f842d75..8036a5f617 100644 --- a/api/common/tensorflow_op_benchmark.py +++ b/api/common/tensorflow_op_benchmark.py @@ -21,12 +21,14 @@ from common import special_op_list from common.benchmark import BenchmarkBase +from . import env from . import utils from . import api_param from . import feeder try: import tensorflow as tf + from tensorflow.python.profiler import model_analyzer from tensorflow.python.profiler import option_builder from tensorflow.core.protobuf import config_pb2 @@ -55,9 +57,9 @@ def __enter__(self): import cProfile self._profiler_handle = cProfile.Profile() self._profiler_handle.enable() - elif self.profiler != "none": + elif self.profiler == "native": self._profiler_handle = model_analyzer.Profiler( - graph=self.sess.graph) + graph=self._sess.graph) self.run_options = tf.compat.v1.RunOptions( trace_level=tf.compat.v1.RunOptions.FULL_TRACE) self.run_metadata = tf.compat.v1.RunMetadata() @@ -247,8 +249,10 @@ def generate_random_feeder(self, assert use_feed_fetch, "Argument use_feed_fetch must be True when feeder_adapter is initialized by paddle." 
         if feeder_adapter is None or feeder_adapter.framework != "tensorflow":
-            self._need_feed = config.name == "feed"
-            self._need_fetch = use_feed_fetch or config.name == "fetch"
+            self._need_feed = env.benchmark_need_feed(
+            ) or config.name == "feed"
+            self._need_fetch = env.benchmark_need_fetch(
+            ) or use_feed_fetch or config.name == "fetch"
             self._feed_spec = feeder.copy_feed_spec(config.feed_spec)
             self._feed_dict = {}
 
@@ -294,12 +298,14 @@ def run(self, config, args, use_feed_fetch=True, feeder_adapter=None):
         self.fetch_list = fetch_list
 
         self.allow_growth = False if args.task == "speed" else True
-        outputs, stats = self.run_impl(
-            use_gpu=args.use_gpu,
-            config=config,
-            feed=feed,
-            repeat=args.repeat,
-            profiler=args.profiler)
+        device = "GPU:0" if args.use_gpu else "CPU"
+        with tf.device(device):
+            outputs, stats = self.run_impl(
+                use_gpu=args.use_gpu,
+                config=config,
+                feed=feed,
+                repeat=args.repeat,
+                profiler=args.profiler)
         return outputs, stats
 
     def _init_session(self, use_gpu):
diff --git a/api/deploy/collect_api_info.py b/api/deploy/collect_api_info.py
index eff774a8a0..40299d9b6d 100644
--- a/api/deploy/collect_api_info.py
+++ b/api/deploy/collect_api_info.py
@@ -51,8 +51,16 @@ def collect_subclass_dict(test_cases_dict):
 
 
 def import_all_tests(test_module_name):
+    def _is_special_module(api_name):
+        special_module_list = [
+            "__init__", "common_import", "test_main", "fused_"
+        ]
+        for name in special_module_list:
+            if name in api_name:
+                return True
+        return False
+
     test_cases_dict = {}
-    special_module_list = ["__init__", "common_import", "test_main"]
 
     def _import_api(test_module_name, basename):
         try:
@@ -68,7 +76,7 @@ def _import_api(test_module_name, basename):
     for filename in sorted(os.listdir(tests_path)):
         api_name = os.path.splitext(filename)[0]
         file_extension = os.path.splitext(filename)[1]
-        if file_extension == '.py' and api_name not in special_module_list:
+        if file_extension == '.py' and not _is_special_module(api_name):
             module = _import_api(test_module_name, api_name)
             if module:
                 test_cases_dict[api_name] = module
@@ -134,7 +142,7 @@ def main(args):
     parser.add_argument(
         '--test_module_name',
         type=str,
-        default="tests",
+        default="tests_v2",
         help='The module_name under benchmark/api (tests|tests_v2|dynamic_tests_v2).'
     )
     parser.add_argument(
diff --git a/api/run_op_benchmark.sh b/api/run_op_benchmark.sh
index f717998a78..156a92a65f 100755
--- a/api/run_op_benchmark.sh
+++ b/api/run_op_benchmark.sh
@@ -4,7 +4,8 @@ OP_BENCHMARK_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")" && pwd )"
 
 test_module_name=${1:-"dynamic_tests_v2"} # "tests_v2", "dynamic_tests_v2"
 gpu_ids=${2:-"0"}
-op_type=${3:-"all"} # "all" or specified op_type, such as elementwise
+model_name_op_type=${3:-"all"} # "all" or specified model_name/op_type, such as elementwise
+config_subdir=${4:-"op_configs"}
 
 if [ ${test_module_name} != "tests_v2" ] && [ ${test_module_name} != "dynamic_tests_v2" ]; then
     echo "Please set test_module_name (${test_module_name}) to \"tests_v2\" or \"dynamic_tests_v2\"!"
@@ -63,10 +64,10 @@ run_op_benchmark() {
     bash ${OP_BENCHMARK_ROOT}/deploy/main_control.sh ${tests_dir} ${config_dir} ${output_dir} ${gpu_ids} "both" "both" "none" "both" "${testing_mode}" > ${log_path} 2>&1 &
 }
 
-run_specified_op() {
+run_specified_task() {
     local testing_mode=$1
 
-    OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${op_type}
+    OUTPUT_ROOT=${OP_BENCHMARK_ROOT}/logs/${model_name_op_type}
     if [ ! -d ${OUTPUT_ROOT} ]; then
         mkdir -p ${OUTPUT_ROOT}
     fi
@@ -79,11 +80,11 @@ run_specified_op() {
     echo "-- output_dir: ${output_dir}"
 
     if [ "${test_module_name}" == "tests" ]; then
-        config_dir=${OP_BENCHMARK_ROOT}/tests/op_configs
-        op_list=${OUTPUT_ROOT}/api_info_${op_type}.txt
+        config_dir=${OP_BENCHMARK_ROOT}/tests/${config_subdir}
+        op_list=${OUTPUT_ROOT}/api_info_${model_name_op_type}.txt
     else
-        config_dir=${OP_BENCHMARK_ROOT}/tests_v2/op_configs
-        op_list=${OUTPUT_ROOT}/api_info_v2_${op_type}.txt
+        config_dir=${OP_BENCHMARK_ROOT}/tests_v2/${config_subdir}
+        op_list=${OUTPUT_ROOT}/api_info_v2_${model_name_op_type}.txt
     fi
 
     echo "-- config_dir: ${config_dir}"
@@ -101,15 +102,15 @@ main() {
         install_package "torch" "1.10.0"
     else
         testing_mode="static"
-        install_package "tensorflow" "2.3.1"
+        install_package "tensorflow" "2.8.0"
     fi
 
-    case ${op_type} in
+    case ${model_name_op_type} in
         all)
             run_op_benchmark ${testing_mode}
         ;;
         *)
-            run_specified_op ${testing_mode}
+            run_specified_task ${testing_mode}
         ;;
     esac
 }
diff --git a/api/tests_v2/batch_norm.py b/api/tests_v2/batch_norm.py
index e74978a91e..9350d6aa0a 100644
--- a/api/tests_v2/batch_norm.py
+++ b/api/tests_v2/batch_norm.py
@@ -16,8 +16,8 @@
 
 
 class BatchNormConfig(APIConfig):
-    def __init__(self):
-        super(BatchNormConfig, self).__init__('batch_norm')
+    def __init__(self, op_type="batch_norm"):
+        super(BatchNormConfig, self).__init__(op_type)
 
     def init_from_json(self, filename, config_id=0, unknown_dim=16):
         super(BatchNormConfig, self).init_from_json(filename, config_id,
diff --git a/api/tests_v2/fused_batch_norm_add_relu.py b/api/tests_v2/fused_batch_norm_add_relu.py
new file mode 100644
index 0000000000..7ba239c613
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_add_relu.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormAddReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormAddReluConfig,
+              self).__init__("fused_batch_norm_add_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormAddRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        add_out = bn_out + y
+        relu_out = paddle.nn.functional.relu(add_out)
+
+        self.feed_vars = [x, y]
+        self.fetch_vars = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out, add_out])
+
+
+class TFFusedBatchNormAddRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        y = self.variable(name='y', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        add_out = bn_out + y
+        relu_out = tf.nn.relu(add_out)
+
+        self.feed_list = [x, y]
+        self.fetch_list = [bn_out, add_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out,
+                                  [x, bn.gamma, bn.beta, bn_out, add_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormAddRelu(),
+        TFFusedBatchNormAddRelu(),
+        config=FusedBatchNormAddReluConfig())
diff --git a/api/tests_v2/fused_batch_norm_relu.py b/api/tests_v2/fused_batch_norm_relu.py
new file mode 100644
index 0000000000..326b972caf
--- /dev/null
+++ b/api/tests_v2/fused_batch_norm_relu.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from common_import import *
+from batch_norm import BatchNormConfig
+
+
+class FusedBatchNormReluConfig(BatchNormConfig):
+    def __init__(self):
+        super(FusedBatchNormReluConfig, self).__init__("fused_batch_norm_relu")
+        self.alias_name = "batch_norm"
+
+
+class PDFusedBatchNormRelu(PaddleAPIBenchmarkBase):
+    def build_program(self, config):
+        def _create_parameter(name, value, stop_gradient):
+            param = paddle.create_parameter(
+                name=name,
+                shape=[config.num_channels],
+                dtype=config.x_dtype,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value)))
+            param.stop_gradient = stop_gradient
+            return param
+
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+
+        running_mean = _create_parameter(
+            name='running_mean', value=0.5, stop_gradient=True)
+        running_var = _create_parameter(
+            name='running_var', value=0.1, stop_gradient=True)
+
+        scale = _create_parameter(name='scale', value=0.5, stop_gradient=False)
+        bias = _create_parameter(name='bias', value=0.1, stop_gradient=False)
+
+        bn_out = paddle.nn.functional.batch_norm(
+            x=x,
+            running_mean=running_mean,
+            running_var=running_var,
+            weight=scale,
+            bias=bias,
+            epsilon=config.epsilon,
+            momentum=config.momentum,
+            training=config.training,
+            data_format=config.data_format)
+        relu_out = paddle.nn.functional.relu(bn_out)
+
+        self.feed_vars = [x]
+        self.fetch_vars = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, scale, bias, bn_out])
+
+
+class TFFusedBatchNormRelu(TensorflowAPIBenchmarkBase):
+    def build_graph(self, config):
+        x = self.variable(name='x', shape=config.x_shape, dtype=config.x_dtype)
+        bn = tf.keras.layers.BatchNormalization(
+            axis=config.axis,
+            momentum=config.momentum,
+            epsilon=config.epsilon,
+            beta_initializer=tf.constant_initializer(0.1),
+            gamma_initializer=tf.constant_initializer(0.5),
+            moving_mean_initializer=tf.constant_initializer(0.5),
+            moving_variance_initializer=tf.constant_initializer(0.1))
+        bn_out = bn(x, training=config.training)
+        relu_out = tf.nn.relu(bn_out)
+
+        self.feed_list = [x]
+        self.fetch_list = [bn_out, relu_out]
+        if config.backward:
+            self.append_gradients(relu_out, [x, bn.gamma, bn.beta, bn_out])
+
+
+if __name__ == '__main__':
+    test_main(
+        PDFusedBatchNormRelu(),
+        TFFusedBatchNormRelu(),
+        config=FusedBatchNormReluConfig())
diff --git a/ci/scripts/run_test.sh b/ci/scripts/run_test.sh
index 9518d6f160..ceb05b8a3a 100644
--- a/ci/scripts/run_test.sh
+++ b/ci/scripts/run_test.sh
@@ -74,6 +74,7 @@ function run_api(){
         LOG "[INFO] Found ${file} modified."
         api=${file#*api/} && api=${api%.*}
         [[ "$api" =~ "common_import" ]] && continue
+        [[ "$api" =~ "fused_" ]] && continue
        [[ "$api" =~ "test_main" ]] && continue
        [[ "$api" =~ "__init__" ]] && continue
        [ -f "${BENCHMARK_ROOT}/api/${api}.py" ] && API_NAMES[${#API_NAMES[@]}]=$api
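Note on the new environment toggles: api/common/env.py reads BENCHMARK_NEED_FEED and BENCHMARK_NEED_FETCH with os.environ.get, which returns a string whenever the variable is set. The minimal sketch below is not part of the patch (the variable name is real, the values are only illustrative); it shows that any non-empty value, even "0" or "False", enables the switch, so the variable must be left unset to keep the default.

# Illustrative sketch only (not part of the patch): behavior of the new
# toggles in api/common/env.py. os.environ.get() returns a string when the
# variable is set, so any non-empty value -- even "0" or "False" -- is truthy;
# unset the variable to disable the switch.
import os


def benchmark_need_fetch():
    return os.environ.get("BENCHMARK_NEED_FETCH", False)


os.environ["BENCHMARK_NEED_FETCH"] = "1"
print(bool(benchmark_need_fetch()))  # True

os.environ["BENCHMARK_NEED_FETCH"] = "False"
print(bool(benchmark_need_fetch()))  # still True: a non-empty string

del os.environ["BENCHMARK_NEED_FETCH"]
print(bool(benchmark_need_fetch()))  # False

When BENCHMARK_NEED_FETCH is set, TimeUnit.total() leaves DtoH memcpy time out of the accumulated memory time, which is why its summary line is labeled "total gpu_time (exclude DtoH)".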