From 0266394d50a1f495afdfb40b4c2e2622c24502d9 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 8 Jun 2020 22:16:33 -0700 Subject: [PATCH] Compress CUDA kernel binaries (CUBINs). Impact: -32MB wheel size -600MB _pywrap_tensorflow_internal.so -70MB memory during startup +120ms startup time PiperOrigin-RevId: 315419813 Change-Id: I2c39a88d95a4aa3a692560c8e3d78b125e8445c9 --- .../clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl | 5 ++++- .../crosstool/windows/msvc_wrapper_for_nvcc.py.tpl | 4 ++++ third_party/gpus/cuda/build_defs.bzl.tpl | 11 +++++++---- third_party/gpus/cuda_configure.bzl | 8 ++++---- .../windows/msvc_wrapper_for_nvcc.py | 4 ++++ .../clang/bin/crosstool_wrapper_driver_is_not_gcc | 5 ++++- 6 files changed, 27 insertions(+), 10 deletions(-) diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl index a48ef8bf35a..b35fec975da 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -62,7 +62,7 @@ def GetOptionValue(argv, option): Args: argv: A list of strings, possibly the argv passed to main(). - option: The option whose value to extract, without the leading '-'. + option: The option whose value to extract, with the leading '-'. Returns: A list of values, either directly following the option, @@ -189,6 +189,8 @@ def InvokeNvcc(argv, log=False): nvcc_allowed_std_options = ["c++03", "c++11", "c++14"] std_options = ''.join([' -std=' + define for define in std_options if define in nvcc_allowed_std_options][-1:]) + fatbin_options = ''.join([' --fatbin-options=' + option + for option in GetOptionValue(argv, '-Xcuda-fatbinary')]) # The list of source files get passed after the -c option. I don't know of # any other reliable way to just get the list of source files to be compiled. @@ -233,6 +235,7 @@ def InvokeNvcc(argv, log=False): nvccopts += std_options nvccopts += m_options nvccopts += warning_options + nvccopts += fatbin_options if depfiles: # Generate the dependency file diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index 73012876691..d2c9b917168 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -130,6 +130,9 @@ def InvokeNvcc(argv, log=False): undefines, argv = GetOptionValue(argv, '/U') undefines = ['-U' + define for define in undefines] + fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary') + fatbin_options = ['--fatbin-options=' + option for option in fatbin_options] + # The rest of the unrecognized options should be passed to host compiler host_compiler_options = [option for option in argv if option not in (src_files + out_file)] @@ -154,6 +157,7 @@ def InvokeNvcc(argv, log=False): nvccopts += undefines nvccopts += defines nvccopts += m_options + nvccopts += fatbin_options nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files # Specify a unique temp directory for nvcc to generate intermediate files, diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl index bba772e2377..d931a02f9b4 100644 --- a/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/gpus/cuda/build_defs.bzl.tpl @@ -40,12 +40,15 @@ def if_cuda_clang_opt(if_true, if_false = []): def cuda_default_copts(): """Default options for all CUDA compilations.""" - return if_cuda( - ["-x", "cuda", "-DGOOGLE_CUDA=1"] - ) + if_cuda_clang_opt( + return if_cuda([ + "-x", "cuda", + "-DGOOGLE_CUDA=1", + "-Xcuda-fatbinary=--compress-all", + "--no-cuda-include-ptx=all" + ] + %{cuda_extra_copts}) + if_cuda_clang_opt( # Some important CUDA optimizations are only enabled at O3. ["-O3"] - ) + %{cuda_extra_copts} + ) def cuda_is_configured(): """Returns true if CUDA was enabled during the configure process.""" diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 90ce206db01..0b87ba1ae2a 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -905,14 +905,14 @@ def _tf_sysroot(repository_ctx): return get_host_environ(repository_ctx, _TF_SYSROOT, "") def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): - capability_flags = ["--no-cuda-include-ptx=all"] + copts = [] for capability in compute_capabilities: if capability.startswith("compute_"): capability = capability.replace("compute_", "sm_") - capability_flags.append("--cuda-include-ptx=%s" % capability) - capability_flags.append("--cuda-gpu-arch=%s" % capability) + copts.append("--cuda-include-ptx=%s" % capability) + copts.append("--cuda-gpu-arch=%s" % capability) - return str(capability_flags) + return str(copts) def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py index 9d5f7fb6118..9c0b8b6efd6 100755 --- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py +++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py @@ -130,6 +130,9 @@ def InvokeNvcc(argv, log=False): undefines, argv = GetOptionValue(argv, '/U') undefines = ['-U' + define for define in undefines] + fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary') + fatbin_options = ['--fatbin-options=' + option for option in fatbin_options] + # The rest of the unrecognized options should be passed to host compiler host_compiler_options = [option for option in argv if option not in (src_files + out_file)] @@ -154,6 +157,7 @@ def InvokeNvcc(argv, log=False): nvccopts += undefines nvccopts += defines nvccopts += m_options + nvccopts += fatbin_options nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files # Specify a unique temp directory for nvcc to generate intermediate files, diff --git a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc index e427b01e9fa..d66945ab7c0 100755 --- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc +++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc @@ -62,7 +62,7 @@ def GetOptionValue(argv, option): Args: argv: A list of strings, possibly the argv passed to main(). - option: The option whose value to extract, without the leading '-'. + option: The option whose value to extract, with the leading '-'. Returns: A list of values, either directly following the option, @@ -189,6 +189,8 @@ def InvokeNvcc(argv, log=False): nvcc_allowed_std_options = ["c++03", "c++11", "c++14"] std_options = ''.join([' -std=' + define for define in std_options if define in nvcc_allowed_std_options][-1:]) + fatbin_options = ''.join([' --fatbin-options=' + option + for option in GetOptionValue(argv, '-Xcuda-fatbinary')]) # The list of source files get passed after the -c option. I don't know of # any other reliable way to just get the list of source files to be compiled. @@ -233,6 +235,7 @@ def InvokeNvcc(argv, log=False): nvccopts += std_options nvccopts += m_options nvccopts += warning_options + nvccopts += fatbin_options if depfiles: # Generate the dependency file