Compress CUDA kernel binaries (CUBINs).

Impact:
  -32MB wheel size
  -600MB _pywrap_tensorflow_internal.so
  -70MB memory during startup
  +120ms startup time

PiperOrigin-RevId: 315419813
Change-Id: I2c39a88d95a4aa3a692560c8e3d78b125e8445c9
2020-06-08 22:16:33 -07:00 · 2020-06-08 22:16:33 -07:00 · 0266394d50
commit 0266394d50
parent 7b0071d954
6 changed files with 27 additions and 10 deletions
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
@ -62,7 +62,7 @@ def GetOptionValue(argv, option):

  Args:
    argv: A list of strings, possibly the argv passed to main().
-    option: The option whose value to extract, without the leading '-'.
+    option: The option whose value to extract, with the leading '-'.

  Returns:
    A list of values, either directly following the option,
@ -189,6 +189,8 @@ def InvokeNvcc(argv, log=False):
  nvcc_allowed_std_options = ["c++03", "c++11", "c++14"]
  std_options = ''.join([' -std=' + define
      for define in std_options if define in nvcc_allowed_std_options][-1:])
+  fatbin_options = ''.join([' --fatbin-options=' + option
+      for option in GetOptionValue(argv, '-Xcuda-fatbinary')])

  # The list of source files get passed after the -c option. I don't know of
  # any other reliable way to just get the list of source files to be compiled.
@ -233,6 +235,7 @@ def InvokeNvcc(argv, log=False):
  nvccopts += std_options
  nvccopts += m_options
  nvccopts += warning_options
+  nvccopts += fatbin_options

  if depfiles:
    # Generate the dependency file
--- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
+++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
@ -130,6 +130,9 @@ def InvokeNvcc(argv, log=False):
  undefines, argv = GetOptionValue(argv, '/U')
  undefines = ['-U' + define for define in undefines]

+  fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary')
+  fatbin_options = ['--fatbin-options=' + option for option in fatbin_options]
+
  # The rest of the unrecognized options should be passed to host compiler
  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]

@ -154,6 +157,7 @@ def InvokeNvcc(argv, log=False):
  nvccopts += undefines
  nvccopts += defines
  nvccopts += m_options
+  nvccopts += fatbin_options
  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
  # Specify a unique temp directory for nvcc to generate intermediate files,
--- a/third_party/gpus/cuda/build_defs.bzl.tpl
+++ b/third_party/gpus/cuda/build_defs.bzl.tpl
@ -40,12 +40,15 @@ def if_cuda_clang_opt(if_true, if_false = []):

 def cuda_default_copts():
    """Default options for all CUDA compilations."""
-    return if_cuda(
-        ["-x", "cuda", "-DGOOGLE_CUDA=1"]
-    ) + if_cuda_clang_opt(
+    return if_cuda([
+        "-x", "cuda",
+        "-DGOOGLE_CUDA=1",
+        "-Xcuda-fatbinary=--compress-all",
+        "--no-cuda-include-ptx=all"
+    ] + %{cuda_extra_copts}) + if_cuda_clang_opt(
        # Some important CUDA optimizations are only enabled at O3.
        ["-O3"]
-    ) + %{cuda_extra_copts}
+    )

 def cuda_is_configured():
    """Returns true if CUDA was enabled during the configure process."""
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@ -905,14 +905,14 @@ def _tf_sysroot(repository_ctx):
    return get_host_environ(repository_ctx, _TF_SYSROOT, "")

 def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
-    capability_flags = ["--no-cuda-include-ptx=all"]
+    copts = []
    for capability in compute_capabilities:
        if capability.startswith("compute_"):
            capability = capability.replace("compute_", "sm_")
-            capability_flags.append("--cuda-include-ptx=%s" % capability)
-        capability_flags.append("--cuda-gpu-arch=%s" % capability)
+            copts.append("--cuda-include-ptx=%s" % capability)
+        copts.append("--cuda-gpu-arch=%s" % capability)

-    return str(capability_flags)
+    return str(copts)

 def _tpl_path(repository_ctx, filename):
    return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename))
--- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@ -130,6 +130,9 @@ def InvokeNvcc(argv, log=False):
  undefines, argv = GetOptionValue(argv, '/U')
  undefines = ['-U' + define for define in undefines]

+  fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary')
+  fatbin_options = ['--fatbin-options=' + option for option in fatbin_options]
+
  # The rest of the unrecognized options should be passed to host compiler
  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]

@ -154,6 +157,7 @@ def InvokeNvcc(argv, log=False):
  nvccopts += undefines
  nvccopts += defines
  nvccopts += m_options
+  nvccopts += fatbin_options
  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
  # Specify a unique temp directory for nvcc to generate intermediate files,
--- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc
@ -62,7 +62,7 @@ def GetOptionValue(argv, option):

  Args:
    argv: A list of strings, possibly the argv passed to main().
-    option: The option whose value to extract, without the leading '-'.
+    option: The option whose value to extract, with the leading '-'.

  Returns:
    A list of values, either directly following the option,
@ -189,6 +189,8 @@ def InvokeNvcc(argv, log=False):
  nvcc_allowed_std_options = ["c++03", "c++11", "c++14"]
  std_options = ''.join([' -std=' + define
      for define in std_options if define in nvcc_allowed_std_options][-1:])
+  fatbin_options = ''.join([' --fatbin-options=' + option
+      for option in GetOptionValue(argv, '-Xcuda-fatbinary')])

  # The list of source files get passed after the -c option. I don't know of
  # any other reliable way to just get the list of source files to be compiled.
@ -233,6 +235,7 @@ def InvokeNvcc(argv, log=False):
  nvccopts += std_options
  nvccopts += m_options
  nvccopts += warning_options
+  nvccopts += fatbin_options

  if depfiles:
    # Generate the dependency file