diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index e6d60e51e75..216fb0a7422 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -64,6 +64,9 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_force_host_platform_device_count(1);
   opts.set_xla_gpu_deterministic_reductions(false);
   opts.set_xla_cpu_enable_xprof_traceme(true);
+  // TODO(b/155295372): disable ptxas fallback by default.
+  opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true);
+  opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(true);
   return opts;
 }
 
@@ -219,340 +222,347 @@ static void AllocateFlags() {
     return true;
   };
 
-  flag_objects = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_cpu_enable_fast_math",
-          bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
-          flag_values->xla_cpu_enable_fast_math(),
-          "Enable unsafe fast-math optimizations in the CPU compiler; "
-          "this may produce faster code at the expense of some accuracy."),
-      tensorflow::Flag(
-          "xla_cpu_fast_math_honor_nans",
-          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_nans),
-          flag_values->xla_cpu_fast_math_honor_nans(),
-          "When xla_cpu_enable_fast_math is true then this controls whether we "
-          "allow operations to produce NaNs. Ignored when "
-          "xla_cpu_enable_fast_math is false."),
-      tensorflow::Flag(
-          "xla_cpu_fast_math_honor_infs",
-          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_infs),
-          flag_values->xla_cpu_fast_math_honor_infs(),
-          "When xla_cpu_enable_fast_math is true then this controls whether we "
-          "allow operations to produce infinites. Ignored when "
-          "xla_cpu_enable_fast_math is false."),
-      tensorflow::Flag(
-          "xla_cpu_fast_math_honor_division",
-          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_division),
-          flag_values->xla_cpu_fast_math_honor_division(),
-          "When xla_cpu_enable_fast_math is true then this controls whether "
-          "we forbid to use multiplication by the reciprocal instead of "
-          "division. Ignored when xla_cpu_enable_fast_math is false."),
-      tensorflow::Flag(
-          "xla_cpu_fast_math_honor_functions",
-          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_functions),
-          flag_values->xla_cpu_fast_math_honor_functions(),
-          "When xla_cpu_enable_fast_math is true then this controls whether "
-          "we forbid to approximate calculations for functions. Ignored when "
-          "xla_cpu_enable_fast_math is false."),
-      tensorflow::Flag(
-          "xla_gpu_enable_fast_min_max",
-          bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max),
-          flag_values->xla_gpu_enable_fast_min_max(),
-          "Enable fast floating point min/max lowering that does not propagate "
-          "NaNs."),
-      tensorflow::Flag(
-          "xla_llvm_enable_alias_scope_metadata",
-          bool_setter_for(
-              &DebugOptions::set_xla_llvm_enable_alias_scope_metadata),
-          flag_values->xla_llvm_enable_alias_scope_metadata(),
-          "In LLVM-based backends, enable the emission of "
-          "!alias.scope metadata in the generated IR."),
-      tensorflow::Flag(
-          "xla_llvm_enable_noalias_metadata",
-          bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata),
-          flag_values->xla_llvm_enable_noalias_metadata(),
-          "In LLVM-based backends, enable the emission of "
-          "!noalias metadata in the generated IR."),
-      tensorflow::Flag(
-          "xla_llvm_enable_invariant_load_metadata",
-          bool_setter_for(
-              &DebugOptions::set_xla_llvm_enable_invariant_load_metadata),
-          flag_values->xla_llvm_enable_invariant_load_metadata(),
-          "In LLVM-based backends, enable the emission of "
-          "!invariant.load metadata in "
-          "the generated IR."),
-      tensorflow::Flag(
-          "xla_llvm_disable_expensive_passes",
-          bool_setter_for(&DebugOptions::set_xla_llvm_disable_expensive_passes),
-          flag_values->xla_llvm_disable_expensive_passes(),
-          "In LLVM-based backends, disable a custom set of "
-          "expensive optimization passes."),
-      tensorflow::Flag(
-          "xla_backend_optimization_level",
-          int32_setter_for(&DebugOptions::set_xla_backend_optimization_level),
-          flag_values->xla_backend_optimization_level(),
-          "Numerical optimization level for the XLA compiler backend."),
-      tensorflow::Flag(
-          "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "",
-          "Comma-separated list of hlo passes to be disabled. These names "
-          "must exactly match the passes' names; no whitespace around "
-          "commas."),
-      tensorflow::Flag(
-          "xla_enable_hlo_passes_only", setter_for_xla_enable_hlo_passes_only,
-          "",
-          "Comma-separated list of hlo passes to be enabled. These names "
-          "must exactly match the passes' names; no whitespace around "
-          "commas. The unspecified passes are all disabled."),
-      tensorflow::Flag(
-          "xla_disable_all_hlo_passes",
-          bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false,
-          "Disables all HLO passes. Notes that some passes are necessary for "
-          "correctness and the invariants that must be satisfied by 'fully "
-          "optimized' HLO are different for different devices and may change "
-          "over time. The only 'guarantee', such as it is, is that if you "
-          "compile XLA and dump the optimized HLO for some graph, you should "
-          "be able to run it again on the same device with the same build of "
-          "XLA."),
-      tensorflow::Flag(
-          "xla_embed_ir_in_executable",
-          bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
-          flag_values->xla_embed_ir_in_executable(),
-          "Embed the compiler IR as a string in the executable."),
-      tensorflow::Flag(
-          "xla_eliminate_hlo_implicit_broadcast",
-          bool_setter_for(
-              &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast),
-          flag_values->xla_eliminate_hlo_implicit_broadcast(),
-          "Eliminate implicit broadcasts when lowering user "
-          "computations to HLO instructions; use explicit "
-          "broadcast instead."),
-      tensorflow::Flag(
-          "xla_cpu_multi_thread_eigen",
-          bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen),
-          flag_values->xla_cpu_multi_thread_eigen(),
-          "When generating calls to Eigen in the CPU backend, "
-          "use multi-threaded Eigen mode."),
-      tensorflow::Flag("xla_gpu_cuda_data_dir",
-                       flag_values->mutable_xla_gpu_cuda_data_dir(),
-                       "If non-empty, specifies a local directory containing "
-                       "ptxas and nvvm libdevice files; otherwise we use "
-                       "those from runfile directories."),
-      tensorflow::Flag("xla_gpu_ftz",
-                       bool_setter_for(&DebugOptions::set_xla_gpu_ftz),
-                       flag_values->xla_gpu_ftz(),
-                       "If true, flush-to-zero semantics are enabled in the "
-                       "code generated for GPUs."),
-      tensorflow::Flag(
-          "xla_gpu_disable_multi_streaming",
-          bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
-          flag_values->xla_gpu_disable_multi_streaming(),
-          "If true, multi-streaming in the GPU backend is disabled."),
-      tensorflow::Flag(
-          "xla_gpu_max_kernel_unroll_factor",
-          int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
-          flag_values->xla_gpu_max_kernel_unroll_factor(),
-          "Specify the maximum kernel unroll factor for the GPU backend."),
-      tensorflow::Flag("xla_gpu_ptx_file", setter_for_xla_gpu_ptx_file, "",
-                       "If non-empty, specifies a file containing ptx to use. "
-                       "The filename prefix must have the same pattern as PTX "
-                       "dumped by XLA. This allows to match one specific "
-                       "module. General workflow. Get the generated module "
-                       "ptx from XLA. Modify it. Then pass it back via this "
-                       "option."),
-      tensorflow::Flag(
-          "xla_test_all_output_layouts",
-          bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
-          flag_values->xla_test_all_output_layouts(),
-          "Let ClientLibraryTestBase::ComputeAndCompare* test "
-          "all permutations of output layouts. For example, with "
-          "a 3D shape, all permutations of the set {0, 1, 2} are "
-          "tried."),
-      tensorflow::Flag(
-          "xla_test_all_input_layouts",
-          bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts),
-          flag_values->xla_test_all_input_layouts(),
-          "Let ClientLibraryTestBase::ComputeAndCompare* test "
-          "all permutations of *input* layouts. For example, for "
-          "2 input arguments with 2D shape and 4D shape, the "
-          "computation will run 2! * 4! times for every possible "
-          "layouts"),
-      tensorflow::Flag(
-          "xla_hlo_profile",
-          bool_setter_for(&DebugOptions::set_xla_hlo_profile),
-          flag_values->xla_hlo_profile(),
-          "Instrument the computation to collect per-HLO cycle counts"),
-      tensorflow::Flag("xla_backend_extra_options",
-                       setter_for_xla_backend_extra_options, "",
-                       "Extra options to pass to a backend; "
-                       "comma-separated list of 'key=val' strings (=val "
-                       "may be omitted); no whitespace around commas."),
-      tensorflow::Flag(
-          "xla_gpu_use_cudnn_batchnorm",
-          bool_setter_for(&DebugOptions::set_xla_gpu_use_cudnn_batchnorm),
-          flag_values->xla_gpu_use_cudnn_batchnorm(),
-          "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
-          "rather than expanding them to a soup of HLOs."),
+  flag_objects = new std::vector<tensorflow::Flag>();
+  flag_objects->reserve(55);
+  // Don't use an initializer list for initializing the vector; this would
+  // create a temporary copy and exceed the stack space when compiling with
+  // certain configurations.
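The comment above carries the reasoning behind this refactor: a braced initializer list materializes every element at once in the temporary backing array of a std::initializer_list, typically on the stack of AllocateFlags(), before the vector copies them out. With roughly 55 sizable tensorflow::Flag objects that temporary can overflow the stack in some build configurations. A minimal standalone sketch of the difference, assuming a hypothetical BigFlag type (none of these names are from the patch):

    #include <string>
    #include <vector>

    // BigFlag stands in for a large tensorflow::Flag-like object.
    struct BigFlag {
      std::string name;
      std::string usage;
      char payload[4096];  // makes each element expensive to hold on the stack
    };

    // A braced initializer list first builds a temporary array (the backing
    // store of std::initializer_list<BigFlag>) holding ALL elements at once,
    // typically on this function's stack, then copies each into the vector.
    std::vector<BigFlag>* MakeFlagsViaInitList() {
      return new std::vector<BigFlag>(
          {BigFlag{"a", "usage a"}, BigFlag{"b", "usage b"}});
    }

    // reserve() + push_back() keeps at most one temporary element alive at a
    // time; the stored elements live in the vector's heap buffer.
    std::vector<BigFlag>* MakeFlagsViaPushBack() {
      auto* flags = new std::vector<BigFlag>();
      flags->reserve(2);  // mirrors flag_objects->reserve(55) in the patch
      flags->push_back(BigFlag{"a", "usage a"});
      flags->push_back(BigFlag{"b", "usage b"});
      return flags;
    }

The reserve() call keeps the push_back loop from reallocating, so the rewrite costs no extra copies relative to the initializer list.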
Ignored when " + "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), + flag_values->xla_gpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that does not propagate " + "NaNs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_alias_scope_metadata", + bool_setter_for(&DebugOptions::set_xla_llvm_enable_alias_scope_metadata), + flag_values->xla_llvm_enable_alias_scope_metadata(), + "In LLVM-based backends, enable the emission of !alias.scope metadata in " + "the generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_noalias_metadata", + bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata), + flag_values->xla_llvm_enable_noalias_metadata(), + "In LLVM-based backends, enable the emission of !noalias metadata in the " + "generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_enable_invariant_load_metadata", + bool_setter_for( + &DebugOptions::set_xla_llvm_enable_invariant_load_metadata), + flag_values->xla_llvm_enable_invariant_load_metadata(), + "In LLVM-based backends, enable the emission of !invariant.load metadata " + "in the generated IR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_llvm_disable_expensive_passes", + bool_setter_for(&DebugOptions::set_xla_llvm_disable_expensive_passes), + flag_values->xla_llvm_disable_expensive_passes(), + "In LLVM-based backends, disable a custom set of expensive optimization " + "passes.")); + flag_objects->push_back(tensorflow::Flag( + "xla_backend_optimization_level", + int32_setter_for(&DebugOptions::set_xla_backend_optimization_level), + flag_values->xla_backend_optimization_level(), + "Numerical optimization level for the XLA compiler backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "", + "Comma-separated list of hlo passes to be disabled. These names must " + "exactly match the passes' names; no whitespace around commas.")); + flag_objects->push_back(tensorflow::Flag( + "xla_enable_hlo_passes_only", setter_for_xla_enable_hlo_passes_only, "", + "Comma-separated list of hlo passes to be enabled. These names must " + "exactly match the passes' names; no whitespace around commas. The " + "unspecified passes are all disabled.")); + flag_objects->push_back(tensorflow::Flag( + "xla_disable_all_hlo_passes", + bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false, + "Disables all HLO passes. Notes that some passes are necessary for " + "correctness and the invariants that must be satisfied by 'fully " + "optimized' HLO are different for different devices and may change " + "over time. 
The only 'guarantee', such as it is, is that if you compile " + "XLA and dump the optimized HLO for some graph, you should be able to " + "run it again on the same device with the same build of XLA.")); + flag_objects->push_back(tensorflow::Flag( + "xla_embed_ir_in_executable", + bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable), + flag_values->xla_embed_ir_in_executable(), + "Embed the compiler IR as a string in the executable.")); + flag_objects->push_back(tensorflow::Flag( + "xla_eliminate_hlo_implicit_broadcast", + bool_setter_for(&DebugOptions::set_xla_eliminate_hlo_implicit_broadcast), + flag_values->xla_eliminate_hlo_implicit_broadcast(), + "Eliminate implicit broadcasts when lowering user computations to HLO " + "instructions; use explicit broadcast instead.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_multi_thread_eigen", + bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen), + flag_values->xla_cpu_multi_thread_eigen(), + "When generating calls to Eigen in the CPU backend, use multi-threaded " + "Eigen mode.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_cuda_data_dir", flag_values->mutable_xla_gpu_cuda_data_dir(), + "If non-empty, specifies a local directory containing ptxas and nvvm " + "libdevice files; otherwise we use those from runfile directories.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_ftz", bool_setter_for(&DebugOptions::set_xla_gpu_ftz), + flag_values->xla_gpu_ftz(), + "If true, flush-to-zero semantics are enabled in the code generated for " + "GPUs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_disable_multi_streaming", + bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming), + flag_values->xla_gpu_disable_multi_streaming(), + "If true, multi-streaming in the GPU backend is disabled.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_max_kernel_unroll_factor", + int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), + flag_values->xla_gpu_max_kernel_unroll_factor(), + "Specify the maximum kernel unroll factor for the GPU backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_ptx_file", setter_for_xla_gpu_ptx_file, "", + "If non-empty, specifies a file containing ptx to use. The filename " + "prefix must have the same pattern as PTX dumped by XLA. This allows to " + "match one specific module. General workflow. Get the generated module " + "ptx from XLA. Modify it. Then pass it back via this option.")); + flag_objects->push_back(tensorflow::Flag( + "xla_test_all_output_layouts", + bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), + flag_values->xla_test_all_output_layouts(), + "Let ClientLibraryTestBase::ComputeAndCompare* test all permutations of " + "output layouts. For example, with a 3D shape, all permutations of the " + "set {0, 1, 2} are tried.")); + flag_objects->push_back(tensorflow::Flag( + "xla_test_all_input_layouts", + bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts), + flag_values->xla_test_all_input_layouts(), + "Let ClientLibraryTestBase::ComputeAndCompare* test all permutations of " + "*input* layouts. For example, for 2 input arguments with 2D shape and " + "4D shape, the computation will run 2! * 4! 
times for every possible " + "layouts")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_profile", bool_setter_for(&DebugOptions::set_xla_hlo_profile), + flag_values->xla_hlo_profile(), + "Instrument the computation to collect per-HLO cycle counts")); + flag_objects->push_back(tensorflow::Flag( + "xla_backend_extra_options", setter_for_xla_backend_extra_options, "", + "Extra options to pass to a backend; comma-separated list of 'key=val' " + "strings (=val may be omitted); no whitespace around commas.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_use_cudnn_batchnorm", + bool_setter_for(&DebugOptions::set_xla_gpu_use_cudnn_batchnorm), + flag_values->xla_gpu_use_cudnn_batchnorm(), + "Allows the GPU backend to implement batchnorm HLOs using cudnn, rather " + "than expanding them to a soup of HLOs.")); + flag_objects->push_back( tensorflow::Flag("xla_cpu_use_mkl_dnn", bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn), flag_values->xla_cpu_use_mkl_dnn(), - "Generate calls to MKL-DNN in the CPU backend."), - tensorflow::Flag( - "xla_gpu_crash_on_verification_failures", - bool_setter_for( - &DebugOptions::set_xla_gpu_crash_on_verification_failures), - flag_values->xla_gpu_crash_on_verification_failures(), - "Crashes the program on extra verification failures, e.g. cuDNN " - "cross checking failures"), - tensorflow::Flag( - "xla_gpu_autotune_level", - int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level), - flag_values->xla_gpu_autotune_level(), - "Set GEMM and Convolution auto-tuning level." - "0 = off; 1 = on; 2 = on+init; 3 = on+init+reinit; 4 = " - "on+init+reinit+check."), - tensorflow::Flag( - "xla_force_host_platform_device_count", - int32_setter_for( - &DebugOptions::set_xla_force_host_platform_device_count), - flag_values->xla_force_host_platform_device_count(), - "Force the host platform to pretend that there are these many " - "host \"devices\". All of these host devices are backed by the same" - "threadpool. Setting this to anything other than 1 can increase " - "overhead from context switching but we let the user override this " - "behavior to help run tests on the host that run models in parallel " - "across multiple devices."), - tensorflow::Flag( - "xla_gpu_disable_gpuasm_optimizations", - bool_setter_for( - &DebugOptions::set_xla_gpu_disable_gpuasm_optimizations), - flag_values->xla_gpu_disable_gpuasm_optimizations(), - "In XLA:GPU run ptxas in -O0 (default is -O3)."), - tensorflow::Flag( - "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", - "Sets compiler fuel, useful for bisecting bugs in passes. Format " - "--xla_fuel=PASS1=NUM1,PASS2=NUM2,..."), - - tensorflow::Flag( - "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to), - flag_values->xla_dump_to(), - "Directory into which debugging data is written. If not specified " - "but another dumping flag is passed, data will be written to stdout. " - " To explicitly write to stdout, set this to \"-\". The values " - "\"sponge\" and \"test_undeclared_outputs_dir\" have a special " - "meaning: They cause us to dump into the directory specified by the " - "environment variable TEST_UNDECLARED_OUTPUTS_DIR."), - tensorflow::Flag( - "xla_dump_hlo_as_text", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text), - flag_values->xla_dump_hlo_as_text(), - "Dumps HLO modules as text before and after optimizations. 
Results " - "are written to the --xla_dump_to dir, or, if no dir is specified, " - "to stdout."), - tensorflow::Flag( - "xla_dump_hlo_as_proto", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto), - flag_values->xla_dump_hlo_as_proto(), - "Dumps HLO modules as HloProtos to the directory specified by " - "--xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_as_dot", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot), - flag_values->xla_dump_hlo_as_dot(), - "Dumps HLO modules rendered as dot files to the directory " - "specified by --xla_dump_to."), + "Generate calls to MKL-DNN in the CPU backend.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_crash_on_verification_failures", + bool_setter_for( + &DebugOptions::set_xla_gpu_crash_on_verification_failures), + flag_values->xla_gpu_crash_on_verification_failures(), + "Crashes the program on extra verification failures, e.g. cuDNN cross " + "checking failures")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_autotune_level", + int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level), + flag_values->xla_gpu_autotune_level(), + "Set GEMM and Convolution auto-tuning level. 0 = off; 1 = on; 2 = " + "on+init; 3 = on+init+reinit; 4 = on+init+reinit+check.")); + flag_objects->push_back(tensorflow::Flag( + "xla_force_host_platform_device_count", + int32_setter_for(&DebugOptions::set_xla_force_host_platform_device_count), + flag_values->xla_force_host_platform_device_count(), + "Force the host platform to pretend that there are these many host " + "\"devices\". All of these host devices are backed by the same " + "threadpool. Setting this to anything other than 1 can increase overhead " + "from context switching but we let the user override this behavior to " + "help run tests on the host that run models in parallel across multiple " + "devices.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_disable_gpuasm_optimizations", + bool_setter_for(&DebugOptions::set_xla_gpu_disable_gpuasm_optimizations), + flag_values->xla_gpu_disable_gpuasm_optimizations(), + "In XLA:GPU run ptxas in -O0 (default is -O3).")); + flag_objects->push_back(tensorflow::Flag( + "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", + "Sets compiler fuel, useful for bisecting bugs in passes. Format " + "--xla_fuel=PASS1=NUM1,PASS2=NUM2,...")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to), + flag_values->xla_dump_to(), + "Directory into which debugging data is written. If not specified but " + "another dumping flag is passed, data will be written to stdout. To " + "explicitly write to stdout, set this to \"-\". The values \"sponge\" " + "and \"test_undeclared_outputs_dir\" have a special meaning: They cause " + "us to dump into the directory specified by the environment variable " + "TEST_UNDECLARED_OUTPUTS_DIR.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_text", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text), + flag_values->xla_dump_hlo_as_text(), + "Dumps HLO modules as text before and after optimizations. 
Results are " + "written to the --xla_dump_to dir, or, if no dir is specified, to " + "stdout.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_proto", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto), + flag_values->xla_dump_hlo_as_proto(), + "Dumps HLO modules as HloProtos to the directory specified by " + "--xla_dump_to.")); + flag_objects->push_back( + tensorflow::Flag("xla_dump_hlo_as_dot", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot), + flag_values->xla_dump_hlo_as_dot(), + "Dumps HLO modules rendered as dot files to the " + "directory specified by --xla_dump_to.")); + flag_objects->push_back( tensorflow::Flag("xla_dump_hlo_as_html", bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_html), flag_values->xla_dump_hlo_as_html(), "Dumps HLO modules rendered as HTML files to the " - "directory specified by --xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_as_url", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url), - flag_values->xla_dump_hlo_as_url(), - "Tries to dump HLO modules rendered as URLs to stdout (and also to " - "the directory specified by --xla_dump_to). This is not implemented " - "by default; you need to add a plugin which calls " - "RegisterGraphToURLRenderer()."), - tensorflow::Flag( - "xla_dump_hlo_snapshots", - bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots), - flag_values->xla_dump_hlo_snapshots(), - "Every time an HLO module is run, dumps an HloSnapshot to the " - "directory specified by --xla_dump_to."), - tensorflow::Flag( - "xla_dump_hlo_module_re", - string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re), - flag_values->xla_dump_hlo_module_re(), - "Limits dumping only to modules which match this regular expression. " - " Default is to dump all modules."), - tensorflow::Flag( - "xla_dump_hlo_pass_re", - string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re), - flag_values->xla_dump_hlo_pass_re(), - "If specified, dumps HLO before and after optimization passes which " - "match this regular expression, in addition to dumping at the very " - "beginning and end of compilation."), - tensorflow::Flag( - "xla_dump_include_timestamp", - bool_setter_for(&DebugOptions::set_xla_dump_include_timestamp), - flag_values->xla_dump_include_timestamp(), - "If specified, includes a timestamp in the dumped filenames."), - tensorflow::Flag( - "xla_dump_max_hlo_modules", - int32_setter_for(&DebugOptions::set_xla_dump_max_hlo_modules), - flag_values->xla_dump_max_hlo_modules(), - "Max number of hlo module dumps in a directory. 
Set to < 0 for " - "unbounded."), - tensorflow::Flag( - "xla_hlo_graph_addresses", - bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses), - flag_values->xla_hlo_graph_addresses(), - "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays " - "the address in memory of each HloInstruction object."), - tensorflow::Flag( - "xla_hlo_graph_sharding_color", - bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color), - flag_values->xla_hlo_graph_sharding_color(), - "Assign colors based on sharding assignments when generating the " - "HLO graphs."), - tensorflow::Flag( - "xla_allow_excess_precision", - bool_setter_for(&DebugOptions::set_xla_allow_excess_precision), - flag_values->xla_allow_excess_precision(), - "Allow xla to increase the output precision of an instruction."), - tensorflow::Flag( - "xla_gpu_force_conv_nchw", - bool_setter_for(&DebugOptions::set_xla_gpu_force_conv_nchw), - flag_values->xla_gpu_force_conv_nchw(), - "For cuDNN convolutions, always NCHW layouts."), - tensorflow::Flag("xla_gpu_algorithm_blacklist_path", - string_setter_for( - &DebugOptions::set_xla_gpu_algorithm_blacklist_path), - flag_values->xla_gpu_algorithm_blacklist_path(), - "An AlgorithmBlacklist text proto file as a blacklist " - "of convolutions to avoid to use."), - tensorflow::Flag( - "xla_gpu_deterministic_reductions", - bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_reductions), - flag_values->xla_gpu_deterministic_reductions(), - "Always run deterministic reductions on GPU"), - tensorflow::Flag( - "xla_tpu_detect_nan", - bool_setter_for(&DebugOptions::set_xla_tpu_detect_nan), - flag_values->xla_tpu_detect_nan(), - "Trigger error on execution on TPU if a NAN value is detected"), - tensorflow::Flag( - "xla_tpu_detect_inf", - bool_setter_for(&DebugOptions::set_xla_tpu_detect_inf), - flag_values->xla_tpu_detect_inf(), - "Trigger error on execution on TPU if a INF value is detected"), - tensorflow::Flag( - "xla_cpu_enable_xprof_traceme", - bool_setter_for(&DebugOptions::set_xla_cpu_enable_xprof_traceme), - flag_values->xla_cpu_enable_xprof_traceme(), - "If true, XLA CPU generates code to call " - "TraceMe::Activity{Start|End} around HLO operations."), - }); + "directory specified by --xla_dump_to.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_as_url", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url), + flag_values->xla_dump_hlo_as_url(), + "Tries to dump HLO modules rendered as URLs to stdout (and also to the " + "directory specified by --xla_dump_to). This is not implemented by " + "default; you need to add a plugin which calls " + "RegisterGraphToURLRenderer().")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_snapshots", + bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots), + flag_values->xla_dump_hlo_snapshots(), + "Every time an HLO module is run, dumps an HloSnapshot to the directory " + "specified by --xla_dump_to.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_module_re", + string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re), + flag_values->xla_dump_hlo_module_re(), + "Limits dumping only to modules which match this regular expression. 
" + "Default is to dump all modules.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_hlo_pass_re", + string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re), + flag_values->xla_dump_hlo_pass_re(), + "If specified, dumps HLO before and after optimization passes which " + "match this regular expression, in addition to dumping at the very " + "beginning and end of compilation.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_include_timestamp", + bool_setter_for(&DebugOptions::set_xla_dump_include_timestamp), + flag_values->xla_dump_include_timestamp(), + "If specified, includes a timestamp in the dumped filenames.")); + flag_objects->push_back(tensorflow::Flag( + "xla_dump_max_hlo_modules", + int32_setter_for(&DebugOptions::set_xla_dump_max_hlo_modules), + flag_values->xla_dump_max_hlo_modules(), + "Max number of hlo module dumps in a directory. Set to < 0 for " + "unbounded.")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_graph_addresses", + bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses), + flag_values->xla_hlo_graph_addresses(), + "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays " + "the address in memory of each HloInstruction object.")); + flag_objects->push_back(tensorflow::Flag( + "xla_hlo_graph_sharding_color", + bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color), + flag_values->xla_hlo_graph_sharding_color(), + "Assign colors based on sharding assignments when generating the HLO " + "graphs.")); + flag_objects->push_back(tensorflow::Flag( + "xla_allow_excess_precision", + bool_setter_for(&DebugOptions::set_xla_allow_excess_precision), + flag_values->xla_allow_excess_precision(), + "Allow xla to increase the output precision of an instruction.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_force_conv_nchw", + bool_setter_for(&DebugOptions::set_xla_gpu_force_conv_nchw), + flag_values->xla_gpu_force_conv_nchw(), + "For cuDNN convolutions, always NCHW layouts.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_algorithm_blacklist_path", + string_setter_for(&DebugOptions::set_xla_gpu_algorithm_blacklist_path), + flag_values->xla_gpu_algorithm_blacklist_path(), + "An AlgorithmBlacklist text proto file as a blacklist of convolutions to " + "avoid to use.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_deterministic_reductions", + bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_reductions), + flag_values->xla_gpu_deterministic_reductions(), + "Always run deterministic reductions on GPU")); + flag_objects->push_back(tensorflow::Flag( + "xla_tpu_detect_nan", + bool_setter_for(&DebugOptions::set_xla_tpu_detect_nan), + flag_values->xla_tpu_detect_nan(), + "Trigger error on execution on TPU if a NAN value is detected")); + flag_objects->push_back(tensorflow::Flag( + "xla_tpu_detect_inf", + bool_setter_for(&DebugOptions::set_xla_tpu_detect_inf), + flag_values->xla_tpu_detect_inf(), + "Trigger error on execution on TPU if a INF value is detected")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_xprof_traceme", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_xprof_traceme), + flag_values->xla_cpu_enable_xprof_traceme(), + "If true, XLA CPU generates code to call " + "TraceMe::Activity{Start|End} around HLO operations.")); + flag_objects->push_back(tensorflow::Flag( + "xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found", + bool_setter_for( + &DebugOptions:: + set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found), + 
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 0906c71064e..0196267d904 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -402,10 +402,25 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
                   "using $PATH.",
                   hlo_module_config);
             }
+            CHECK(hlo_module_config.debug_options()
+                      .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found())
+                << "Could not find ptxas to compile ptx into sass code. If "
+                   "you want to try falling back to the GPU driver to jit "
+                   "compile ptx, you can use the flag "
+                   "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found. "
+                   "Use it at your own risk, though; it has known drawbacks "
+                   "like increased memory consumption.";
           } else {
             LOG(ERROR) << "Error during compilation of ptx to sass: "
-                       << maybe_cubin.status()
-                       << ". Falling back to the GPU driver.";
+                       << maybe_cubin.status();
+            CHECK(hlo_module_config.debug_options()
+                      .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error())
+                << "There was an error when trying to compile ptx into sass "
+                   "code. If you want to try falling back to the GPU driver "
+                   "to jit compile ptx, you can use the flag "
+                   "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_error. "
+                   "Use it at your own risk, though; it has known drawbacks "
+                   "like increased memory consumption.";
           }
 
           // We're going to use the driver to JIT our PTX->SASS, so warn if
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 826876ed9cb..a015af674af 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -272,7 +272,15 @@ message DebugOptions {
   // True if TraceMe annotations are enabled for XLA:CPU.
   bool xla_cpu_enable_xprof_traceme = 137;
 
-  // Next id: 138
+  // It is usually preferable not to fall back to the driver; it can consume
+  // more memory, or have bugs.
+  bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138;
+
+  // It is usually preferable not to fall back to the driver; it can consume
+  // more memory, or have bugs.
+  bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139;
+
+  // Next id: 140
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
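The nvptx_compiler.cc hunk above splits the old unconditional driver fallback into two separately gated cases: ptxas missing (a NOT_FOUND status) versus ptxas running and failing. A condensed sketch of that decision, with simplified stand-in types rather than the real XLA/StreamExecutor interfaces:

    #include <cassert>
    #include <iostream>

    // Simplified stand-ins (not the real XLA types) for the control flow in
    // the nvptx_compiler.cc hunk.
    enum class Code { kOk, kNotFound, kInternal };

    struct FallbackOptions {
      // Defaults mirror DefaultDebugOptionsIgnoringFlags() in the patch.
      bool unsafe_fallback_on_ptxas_not_found = true;
      bool unsafe_fallback_on_ptxas_error = true;
    };

    // Returns true when the caller should JIT the PTX with the GPU driver.
    // NOT_FOUND means ptxas was missing; any other non-OK code means ptxas
    // ran and failed. Each case is gated by its own flag, as in the CHECKs.
    bool ShouldFallBackToDriver(Code ptxas_status,
                                const FallbackOptions& opts) {
      if (ptxas_status == Code::kOk) return false;  // got a cubin, no fallback
      if (ptxas_status == Code::kNotFound) {
        assert(opts.unsafe_fallback_on_ptxas_not_found &&
               "ptxas not found and fallback disabled");
        return true;
      }
      std::cerr << "Error during compilation of ptx to sass\n";
      assert(opts.unsafe_fallback_on_ptxas_error &&
             "ptxas failed and fallback disabled");
      return true;
    }

    int main() {
      FallbackOptions opts;
      return ShouldFallBackToDriver(Code::kNotFound, opts) ? 0 : 1;
    }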
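Since both new DebugOptions fields default to true, the unsafe fallback stays enabled until the TODO lands; a user who wants the stricter behavior today would opt out through the same XLA_FLAGS environment variable that ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", ...) consumes. A hedged usage sketch; the flag names come from the registrations above, and the setenv call (POSIX) must run before XLA parses its flags:

    #include <cstdlib>

    int main() {
      // Disable the unsafe driver fallback before XLA initializes.
      setenv("XLA_FLAGS",
             "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found=false "
             "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_error=false",
             /*overwrite=*/1);
      // ... initialize TensorFlow/XLA and run the model as usual ...
      return 0;
    }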