Prefixing TensorFlow thread annotation macros with TF_.

PiperOrigin-RevId: 299110761
Change-Id: I66ecaa9d01dc441f091888bef3f24d220e9180c5
A. Unique TensorFlower 2020-03-05 08:38:16 -08:00 committed by TensorFlower Gardener
parent 5536be153f
commit 83d65b152b
327 changed files with 1766 additions and 1707 deletions
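For reference, a minimal sketch (not taken from this commit; the class and member names are hypothetical) of how the renamed, TF_-prefixed Clang thread-safety annotations are typically used together with tensorflow::mutex:

#include <cstdint>
#include <map>
#include <string>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

// Hypothetical example class; not part of the TensorFlow sources.
class CounterRegistry {
 public:
  // Callers must not already hold mu_; this method acquires it itself.
  void Increment(const std::string& name) TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock lock(mu_);
    IncrementLocked(name);
  }

 private:
  // Callers must already hold mu_.
  void IncrementLocked(const std::string& name)
      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
    counts_[name]++;
  }

  tensorflow::mutex mu_;
  std::map<std::string, int64_t> counts_ TF_GUARDED_BY(mu_);
};

With -Wthread-safety enabled, Clang warns if counts_ is touched without holding mu_ or if IncrementLocked is called without the lock held. The diff below only renames the macro uses (GUARDED_BY becomes TF_GUARDED_BY, and so on); it does not change locking behavior.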

View File

@@ -774,7 +774,7 @@ extern "C" {
 static TF_OperationDescription* TF_NewOperationLocked(TF_Graph* graph,
 const char* op_type,
 const char* oper_name)
-EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
 return new TF_OperationDescription(graph, op_type, oper_name);
 }
@@ -1032,7 +1032,7 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name,
 static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc,
 TF_Status* status)
-EXCLUSIVE_LOCKS_REQUIRED(desc->graph->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(desc->graph->mu) {
 Node* ret = nullptr;
 if (desc->graph->name_map.count(desc->node_builder.node_name())) {
@@ -1706,7 +1706,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def,
 const TF_ImportGraphDefOptions* opts,
 TF_ImportGraphDefResults* tf_results,
 TF_Status* status)
-EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
 const int last_node_id = graph->graph.num_node_ids();
 tensorflow::ImportGraphDefResults results;
 status->status = tensorflow::ImportGraphDef(opts->opts, def, &graph->graph,

View File

@@ -51,7 +51,7 @@ Status ProcessInputs(
 const TF_Graph* fn_body, const char* fn_name, int ninputs,
 const TF_Output* inputs, std::vector<OutputTensor>* input_tensors,
 std::unordered_map<const Node*, std::vector<int>>* input_nodes)
-EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
 input_tensors->reserve(ninputs);
 for (int i = 0; i < ninputs; ++i) {
 Node* node = &inputs[i].oper->node;
@@ -87,7 +87,7 @@ Status ProcessInputs(
 Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name,
 int noutputs, const TF_Output* outputs,
 std::vector<OutputTensor>* output_tensors)
-EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
 output_tensors->reserve(noutputs);
 for (int i = 0; i < noutputs; ++i) {
 Node* node = &outputs[i].oper->node;
@@ -111,7 +111,7 @@ Status ComputeBodyNodes(
 const TF_Operation* const* opers,
 const std::unordered_map<const Node*, std::vector<int>>& input_nodes,
 std::vector<const Node*>* body_nodes)
-EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
 if (num_opers == -1) {
 for (const Node* node : fn_body->graph.op_nodes()) {
 const auto& iter = input_nodes.find(node);

View File

@@ -71,14 +71,14 @@ struct TF_Graph {
 TF_Graph();
 tensorflow::mutex mu;
-tensorflow::Graph graph GUARDED_BY(mu);
+tensorflow::Graph graph TF_GUARDED_BY(mu);
 // Runs shape inference.
-tensorflow::ShapeRefiner refiner GUARDED_BY(mu);
+tensorflow::ShapeRefiner refiner TF_GUARDED_BY(mu);
 // Maps from name of an operation to the Node* in 'graph'.
 std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
-GUARDED_BY(mu);
+TF_GUARDED_BY(mu);
 // The keys of this map are all the active sessions using this graph. Each
 // value records whether the graph has been mutated since the corresponding
@@ -94,8 +94,8 @@ struct TF_Graph {
 // TODO(b/74949947): mutations currently trigger a warning instead of a bad
 // status, this should be reverted when possible.
 tensorflow::gtl::FlatMap<TF_Session*, tensorflow::string> sessions
-GUARDED_BY(mu);
-bool delete_requested GUARDED_BY(mu); // set true by TF_DeleteGraph
+TF_GUARDED_BY(mu);
+bool delete_requested TF_GUARDED_BY(mu); // set true by TF_DeleteGraph
 // Used to link graphs contained in TF_WhileParams to the parent graph that
 // will eventually contain the full while loop.
@@ -123,7 +123,7 @@ struct TF_Session {
 tensorflow::Session* session;
 TF_Graph* const graph;
-tensorflow::mutex mu ACQUIRED_AFTER(TF_Graph::mu);
+tensorflow::mutex mu TF_ACQUIRED_AFTER(TF_Graph::mu);
 int last_num_graph_nodes;
 // If true, TF_SessionRun and similar methods will call
@@ -169,9 +169,9 @@ struct TF_ApiDefMap {
 }
 #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
-tensorflow::ApiDefMap api_def_map GUARDED_BY(lock);
+tensorflow::ApiDefMap api_def_map TF_GUARDED_BY(lock);
 #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
-bool update_docs_called GUARDED_BY(lock);
+bool update_docs_called TF_GUARDED_BY(lock);
 tensorflow::mutex lock;
 };
@@ -210,10 +210,10 @@ void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output,
 void RecordMutation(TF_Graph* graph, const TF_Operation& op,
 const char* mutation_type)
-EXCLUSIVE_LOCKS_REQUIRED(graph->mu);
+TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu);
 bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
-LOCKS_EXCLUDED(session->graph->mu, session->mu);
+TF_LOCKS_EXCLUDED(session->graph->mu, session->mu);
 std::string getTF_OutputDebugString(TF_Output node);
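The TF_ACQUIRED_AFTER annotation changed in the hunk above documents the lock ordering between TF_Session::mu and TF_Graph::mu. A hypothetical sketch of the same pattern (names here are illustrative, not from the TensorFlow sources):

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

struct HypotheticalGraph {
  tensorflow::mutex mu;
};

struct HypotheticalSession {
  // Documents that mu must be acquired after HypotheticalGraph::mu (i.e. the
  // graph lock is taken first), mirroring the TF_Graph::mu / TF_Session::mu
  // ordering annotated above.
  tensorflow::mutex mu TF_ACQUIRED_AFTER(HypotheticalGraph::mu);
};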

View File

@@ -41,7 +41,7 @@ class ClientSession::Impl {
 std::shared_ptr<Graph> graph_;
 mutable mutex mu_;
-mutable int last_num_graph_nodes_ GUARDED_BY(mu_) = 0;
+mutable int last_num_graph_nodes_ TF_GUARDED_BY(mu_) = 0;
 };
 ClientSession::ClientSession(const Scope& scope, const string& target)

View File

@@ -114,14 +114,14 @@ class Coordinator {
 condition_variable wait_for_stop_;
 mutex mu_;
-bool should_stop_ GUARDED_BY(mu_);
+bool should_stop_ TF_GUARDED_BY(mu_);
 mutex status_lock_;
-Status status_ GUARDED_BY(status_lock_);
+Status status_ TF_GUARDED_BY(status_lock_);
 mutable mutex runners_lock_;
 std::vector<std::unique_ptr<RunnerInterface>> runners_
-GUARDED_BY(runners_lock_);
+TF_GUARDED_BY(runners_lock_);
 TF_DISALLOW_COPY_AND_ASSIGN(Coordinator);
 };

View File

@@ -119,8 +119,8 @@ class QueueRunner : public RunnerInterface {
 std::unique_ptr<thread::ThreadPool> thread_pool_;
 mutex mu_;
 int runs_ = 0;
-Status status_ GUARDED_BY(mu_);
-Status enqueue_status_ GUARDED_BY(mu_);
+Status status_ TF_GUARDED_BY(mu_);
+Status enqueue_status_ TF_GUARDED_BY(mu_);
 std::unique_ptr<BlockingCounter> counter_;
 Coordinator* coord_;
@@ -131,7 +131,7 @@ class QueueRunner : public RunnerInterface {
 std::vector<std::function<void(Status)>> callbacks_;
 mutable std::unique_ptr<mutex> cg_mu_;
-std::unique_ptr<CostGraphDef> cost_graph_ GUARDED_BY(cg_mu_);
+std::unique_ptr<CostGraphDef> cost_graph_ TF_GUARDED_BY(cg_mu_);
 RunOptions run_options_;
 };

View File

@@ -172,8 +172,9 @@ class XlaExecutableClosureStore {
 private:
 mutex mutex_;
-int64 key_counter_ GUARDED_BY(mutex_);
-absl::flat_hash_map<KeyT, XlaExecutableClosure> closures_ GUARDED_BY(mutex_);
+int64 key_counter_ TF_GUARDED_BY(mutex_);
+absl::flat_hash_map<KeyT, XlaExecutableClosure> closures_
+TF_GUARDED_BY(mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore);
 };

View File

@@ -165,7 +165,8 @@ class XlaCompileOp : public OpKernel {
 // error when compiling the cluster this _XlaCompile is supposed to compile.
 // If `cannot_compile_cluster_` is true then we avoid compiling this cluster
 // on any future calls to _XlaCompile.
-bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false;
+bool cannot_compile_cluster_ TF_GUARDED_BY(cannot_compile_cluster_mu_) =
+false;
 mutex cannot_compile_cluster_mu_;
 };

View File

@@ -18,13 +18,15 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "tensorflow/compiler/jit/xla_activity.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 namespace tensorflow {
 namespace {
 // The list of all registered `XlaActivityListener`s.
 struct XlaActivityListenerList {
 absl::Mutex mutex;
-std::vector<std::unique_ptr<XlaActivityListener>> listeners GUARDED_BY(mutex);
+std::vector<std::unique_ptr<XlaActivityListener>> listeners
+TF_GUARDED_BY(mutex);
 };
 void FlushAllListeners();

View File

@@ -151,19 +151,19 @@ class XlaCompilationCache : public ResourceBase {
 int64 request_count = 0;
 // Did compilation succeed?
-Status compilation_status GUARDED_BY(mu);
+Status compilation_status TF_GUARDED_BY(mu);
 // Output of the XlaCompiler.
-XlaCompiler::CompilationResult compilation_result GUARDED_BY(mu);
+XlaCompiler::CompilationResult compilation_result TF_GUARDED_BY(mu);
 // The XLA executable compiled from <computation>. May be null if no
 // executable has been built.
-std::unique_ptr<xla::LocalExecutable> executable GUARDED_BY(mu);
+std::unique_ptr<xla::LocalExecutable> executable TF_GUARDED_BY(mu);
 };
 mutex compile_cache_mu_;
 absl::flat_hash_map<Signature, std::unique_ptr<Entry>, Signature::Hash> cache_
-GUARDED_BY(compile_cache_mu_);
+TF_GUARDED_BY(compile_cache_mu_);
 struct ClusterCompileStats {
 // Number of times the cluster has been (re-)compiled.
@@ -185,7 +185,7 @@ class XlaCompilationCache : public ResourceBase {
 // Maps cluster names to compilation statistics for said cluster.
 absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
-GUARDED_BY(cluster_compile_stats_mu_);
+TF_GUARDED_BY(cluster_compile_stats_mu_);
 // The number of times a lazy compilation must be requested for a specific
 // signature before we attempt to compile it.

View File

@@ -83,7 +83,7 @@ class XlaDeviceAllocatorState {
 std::unordered_map<std::pair<const xla::Backend*, int>,
 std::unique_ptr<XlaDeviceAllocator>,
 hash<std::pair<const xla::Backend*, int>>>
-allocators_ GUARDED_BY(allocator_mutex_);
+allocators_ TF_GUARDED_BY(allocator_mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(XlaDeviceAllocatorState);
 };

View File

@@ -137,7 +137,7 @@ class XlaDevice : public LocalDevice {
 ~XlaDevice() override;
 Allocator* GetAllocator(AllocatorAttributes attr) override
-LOCKS_EXCLUDED(mu_);
+TF_LOCKS_EXCLUDED(mu_);
 void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
 void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
 AsyncOpKernel::DoneCallback done) override;
@@ -145,18 +145,18 @@ class XlaDevice : public LocalDevice {
 void Sync(const DoneCallback& done) override;
 Status TryGetDeviceContext(DeviceContext** out_context) override
-LOCKS_EXCLUDED(mu_);
+TF_LOCKS_EXCLUDED(mu_);
 Status MakeTensorFromProto(const TensorProto& tensor_proto,
 const AllocatorAttributes alloc_attrs,
-Tensor* tensor) override LOCKS_EXCLUDED(mu_);
+Tensor* tensor) override TF_LOCKS_EXCLUDED(mu_);
 // Allocate tensor on fast memory space. This is only applied to the new TPU
 // hardware which has faster read/write memory. If the hardware doesn't
 // have such memory space, we fallback to the ordinary memory space.
 Status MakeFastMemTensorFromProto(const TensorProto& tensor_proto,
 const AllocatorAttributes alloc_attrs,
-Tensor* tensor) LOCKS_EXCLUDED(mu_);
+Tensor* tensor) TF_LOCKS_EXCLUDED(mu_);
 const Metadata& metadata() { return xla_metadata_; }
@@ -166,34 +166,35 @@ class XlaDevice : public LocalDevice {
 //
 // TODO(b/111859745): The Eager context needs to call this method to recover
 // from failures.
-Status EnsureDeviceContextOk() LOCKS_EXCLUDED(mu_);
+Status EnsureDeviceContextOk() TF_LOCKS_EXCLUDED(mu_);
 // Instructs this XlaDevice to set a GpuDeviceInfo, which holds extra
 // information for GPU and TPU devices.
-Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_);
+Status UseGpuDeviceInfo() TF_LOCKS_EXCLUDED(mu_);
 // Instructs this XlaDevice to return 'sync_on_completion' for
 // AllowsSyncOnCompletion().
-void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
-bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
+void SetAllowsSyncOnCompletion(bool sync_on_completion)
+TF_LOCKS_EXCLUDED(mu_);
+bool AllowsSyncOnCompletion() const override TF_LOCKS_EXCLUDED(mu_);
 // Installs an error handling callback when RefreshStatus sees !status.ok().
 void SetHandleDeviceErrorCallback(std::function<Status()> callback);
-Status RefreshStatus() override LOCKS_EXCLUDED(mu_);
+Status RefreshStatus() override TF_LOCKS_EXCLUDED(mu_);
 private:
 xla::StatusOr<xla::LocalClient*> GetOrCreateClient() const;
 Allocator* GetAllocatorLocked(AllocatorAttributes attr)
-EXCLUSIVE_LOCKS_REQUIRED(mu_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 Status EnsureStreamOkLocked(xla::Backend* backend, const string& name,
 std::shared_ptr<se::Stream>* stream,
 bool* stream_was_changed)
-EXCLUSIVE_LOCKS_REQUIRED(mu_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 // Return a pair of device context, the second one is fast_mem device context.
 xla::StatusOr<std::pair<XlaDeviceContext*, XlaDeviceContext*>>
-GetDeviceContextLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+GetDeviceContextLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 static Status GetMetadataFromDevice(DeviceBase* device,
 const XlaDevice::Metadata** metadata);
@@ -218,13 +219,13 @@ class XlaDevice : public LocalDevice {
 // Intra-op threads to spawn (from SessionOptions).
 const int intra_op_parallelism_threads_;
 // Memory allocator associated with this device.
-Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned.
+Allocator* xla_allocator_ TF_GUARDED_BY(mu_) = nullptr; // Not owned.
 // Stream associated with this device. Operations enqueued on this
 // stream are executed on the device. Operations include data
 // copying back and forth between CPU and the device, and
 // computations enqueued by XLA.
-std::shared_ptr<se::Stream> stream_ GUARDED_BY(mu_);
+std::shared_ptr<se::Stream> stream_ TF_GUARDED_BY(mu_);
 // If false, only stream_ is valid and all computation and transfers use
 // stream_. If true, computation is performed by stream_ and transfers are
 // performed by host_to_device/device_to_device stream or borrowing a stream
@@ -232,36 +233,36 @@ class XlaDevice : public LocalDevice {
 const bool use_multiple_streams_;
 // If use_multiple_streams_, host to device transfers are performed using this
 // stream.
-std::shared_ptr<se::Stream> host_to_device_stream_ GUARDED_BY(mu_);
+std::shared_ptr<se::Stream> host_to_device_stream_ TF_GUARDED_BY(mu_);
 // If use_multiple_streams_, transfers between different devices are performed
 // using these streams.
 std::vector<std::shared_ptr<se::Stream>> device_to_device_streams_
-GUARDED_BY(mu_);
+TF_GUARDED_BY(mu_);
 const XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 // The device context accessed by all users of the XlaDevice, set by calls to
 // EnsureDeviceContextOk. If gpu_device_info_ is non-null, this pointer is
 // also filled in to that struct. XlaDeviceContext is a ref-counted object.
-XlaDeviceContext* device_context_ GUARDED_BY(mu_) = nullptr;
+XlaDeviceContext* device_context_ TF_GUARDED_BY(mu_) = nullptr;
 // The device context will allocate memory on fast memory space on TPU.
 // XlaDeviceContext is a ref-counted object.
-XlaDeviceContext* fast_mem_device_context_ GUARDED_BY(mu_) = nullptr;
+XlaDeviceContext* fast_mem_device_context_ TF_GUARDED_BY(mu_) = nullptr;
 // Holds extra information for GPU and TPU devices, e.g. the device context.
-bool use_gpu_device_info_ GUARDED_BY(mu_) = false;
-std::unique_ptr<GpuDeviceInfo> gpu_device_info_ GUARDED_BY(mu_);
+bool use_gpu_device_info_ TF_GUARDED_BY(mu_) = false;
+std::unique_ptr<GpuDeviceInfo> gpu_device_info_ TF_GUARDED_BY(mu_);
 // Thread pool used for running closures
 std::unique_ptr<thread::ThreadPool> thread_pool_;
 // True if the device allows XlaDevice::Sync to be called on completion
 // regardless of status.
-bool sync_on_completion_ GUARDED_BY(mu_) = true;
+bool sync_on_completion_ TF_GUARDED_BY(mu_) = true;
 // A callback that will be invoked when RefreshStatus sees a status error.
-std::function<Status()> device_error_callback_ GUARDED_BY(mu_);
+std::function<Status()> device_error_callback_ TF_GUARDED_BY(mu_);
 // Set of devices to use. This controls which of the devices on the given
 // platform will have resources allocated. For GPUs this will be

View File

@@ -117,7 +117,7 @@ class XlaDeviceContext : public DeviceContext {
 bool use_fast_mem_;
 absl::Mutex mu_;
-int next_stream_ GUARDED_BY(mu_) = 0;
+int next_stream_ TF_GUARDED_BY(mu_) = 0;
 };
 } // namespace tensorflow

View File

@@ -18,7 +18,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
-#include "absl/base/thread_annotations.h"
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -30,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/device_memory_allocator.h"
 namespace tensorflow {
@@ -102,7 +102,7 @@ class VariableInfo {
 // `variables` is allowed to contain instances that don't track a resource
 // variable (i.e. variables[i].var() can be null for some i).
 Status LockVariables(absl::Span<VariableInfo> variables)
-EXCLUSIVE_LOCK_FUNCTION();
+TF_EXCLUSIVE_LOCK_FUNCTION();
 // Helper class to perform the marshalling of TensorFlow inputs and outputs to
 // ShapedBuffers suitable for passing to an XLA computation.

View File

@@ -122,7 +122,7 @@ class XlaTensor {
 std::shared_ptr<se::Event> definition_event_;
 // A list of all streams for which the tensor's content is defined for any
 // newly enqueued command.
-absl::InlinedVector<se::Stream*, 2> streams_defined_on_ GUARDED_BY(mu_);
+absl::InlinedVector<se::Stream*, 2> streams_defined_on_ TF_GUARDED_BY(mu_);
 mutex mu_;
 };

View File

@@ -47,7 +47,7 @@ class LoggerRegistryImpl : public LoggerRegistry {
 private:
 mutable mutex mu_;
 mutable std::unordered_map<string, std::unique_ptr<nvinfer1::ILogger>>
-registry_ GUARDED_BY(mu_);
+registry_ TF_GUARDED_BY(mu_);
 };
 LoggerRegistry* GetLoggerRegistry() {

View File

@@ -71,7 +71,7 @@ class CreateTRTResourceHandle : public OpKernel {
 string resource_name_;
 Tensor handle_;
 mutex mutex_;
-bool initialized_ GUARDED_BY(mutex_) = false;
+bool initialized_ TF_GUARDED_BY(mutex_) = false;
 TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTResourceHandle);
 };

View File

@@ -136,7 +136,7 @@ struct EngineContext {
 TrtUniquePtrType<nvinfer1::ICudaEngine> cuda_engine;
 Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx)
-EXCLUSIVE_LOCKS_REQUIRED(mu) {
+TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
 if (idx >= execution_context.size()) {
 return errors::Internal("Requested engine context with index ", idx,
 ", but only ", execution_context.size(),
@@ -152,7 +152,7 @@ struct EngineContext {
 // for inference at a time therefore we need a mutex. More details at
 // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety
 std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> execution_context
-GUARDED_BY(mu);
+TF_GUARDED_BY(mu);
 };
 // Contains the context required to build the calibration data.
@@ -174,8 +174,8 @@ class CalibrationContext {
 private:
 mutex mu_;
-bool terminated_ GUARDED_BY(mu_) = false;
-std::string calibration_table_ GUARDED_BY(mu_);
+bool terminated_ TF_GUARDED_BY(mu_) = false;
+std::string calibration_table_ TF_GUARDED_BY(mu_);
 };
 ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName;

View File

@@ -229,11 +229,11 @@ class XlaOpRegistry {
 };
 // Map from compilation device names to a description of the backend.
-std::unordered_map<string, Backend> backends_ GUARDED_BY(mutex_);
+std::unordered_map<string, Backend> backends_ TF_GUARDED_BY(mutex_);
 // Map from Tensorflow device names to the corresponding JIT device metadata.
 std::unordered_map<string, DeviceRegistration> compilation_devices_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 // A description of a Tensorflow operator that can be compiled to XLA.
 struct OpRegistration {
@@ -292,7 +292,7 @@ class XlaOpRegistry {
 // Registrations present under the same key must satisfy IsCompatible above,
 // and this is checked during registration.
 std::unordered_map<string, std::vector<std::unique_ptr<OpRegistration>>> ops_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 // Have we already registered the JIT kernels on the JIT devices?
 bool jit_kernels_registered_ = false;
@@ -301,7 +301,7 @@ class XlaOpRegistry {
 // registrations created by RegisterCompilationKernels() and
 // RegisterDeviceKernels().
 std::vector<std::unique_ptr<kernel_factory::OpKernelRegistrar>>
-kernel_registrars_ GUARDED_BY(mutex_);
+kernel_registrars_ TF_GUARDED_BY(mutex_);
 };
// REGISTER_XLA_OP() registers an XLA OpKernel by name, for example: // REGISTER_XLA_OP() registers an XLA OpKernel by name, for example:

View File

@@ -134,10 +134,10 @@ class ClientLibrary {
 tensorflow::mutex service_mutex_; // Guards the singleton creation state.
 std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
-local_instances_ GUARDED_BY(service_mutex_);
+local_instances_ TF_GUARDED_BY(service_mutex_);
 std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
-compile_only_instances_ GUARDED_BY(service_mutex_);
+compile_only_instances_ TF_GUARDED_BY(service_mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
 };

View File

@@ -68,7 +68,7 @@ class EventPool {
 const bool allow_reuse_;
 absl::Mutex mu_;
-std::stack<std::unique_ptr<se::Event>> free_events_ GUARDED_BY(mu_);
+std::stack<std::unique_ptr<se::Event>> free_events_ TF_GUARDED_BY(mu_);
 };
 } // namespace xla

View File

@@ -237,7 +237,7 @@ class PyLocalBuffer {
 const Shape on_device_shape_;
 const std::shared_ptr<Device> device_;
 mutable absl::Mutex mu_;
-std::shared_ptr<SharedDeviceBuffer> device_buffer_ GUARDED_BY(mu_);
+std::shared_ptr<SharedDeviceBuffer> device_buffer_ TF_GUARDED_BY(mu_);
 // The cached value of the buffer on the host, produced either from a call to
 // CopyToHost or from a call to ToLiteral. Once a value has been fetched to
@@ -249,7 +249,7 @@ class PyLocalBuffer {
 Status status;
 std::shared_ptr<Literal> value;
 };
-std::shared_ptr<HostValue> host_value_ GUARDED_BY(mu_);
+std::shared_ptr<HostValue> host_value_ TF_GUARDED_BY(mu_);
 };
 // Represents a compiled computation that can be executed given handles to

View File

@@ -129,12 +129,12 @@ class LocalDeviceState {
 static constexpr int kNumDeviceToDeviceStreams = 4;
 absl::Mutex mu_;
-int next_device_to_host_stream_ GUARDED_BY(mu_) = 0;
-int next_device_to_device_stream_ GUARDED_BY(mu_) = 0;
-std::random_device prng_seed_device_ GUARDED_BY(mu_);
-std::mt19937 prng_seed_generator_ GUARDED_BY(mu_);
-std::uniform_int_distribution<> prng_seed_distribution_ GUARDED_BY(mu_);
+int next_device_to_host_stream_ TF_GUARDED_BY(mu_) = 0;
+int next_device_to_device_stream_ TF_GUARDED_BY(mu_) = 0;
+std::random_device prng_seed_device_ TF_GUARDED_BY(mu_);
+std::mt19937 prng_seed_generator_ TF_GUARDED_BY(mu_);
+std::uniform_int_distribution<> prng_seed_distribution_ TF_GUARDED_BY(mu_);
 // Callback stream is used for running short host-side callbacks after device
 // side events, without preventing the device-side stream from doing useful

View File

@@ -66,7 +66,7 @@ class BufferDefinitionEvent {
 void WaitForEventOnStream(se::Stream* stream);
 private:
-bool EventHasBeenRecorded() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+bool EventHasBeenRecorded() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 // An event that is triggered when the content of one or more buffers is
 // ready. If this event is nullptr, it is assumed that the buffer's content is
@@ -77,7 +77,7 @@ class BufferDefinitionEvent {
 // A list of all streams for which the buffer's content is known to be defined
 // at the tail of the queue, i.e., for any newly enqueued command.
-absl::InlinedVector<se::Stream*, 2> streams_defined_on_ GUARDED_BY(mu_);
+absl::InlinedVector<se::Stream*, 2> streams_defined_on_ TF_GUARDED_BY(mu_);
 };
 // Class that represents a node in a reference-counted DAG of device buffers.

View File

@@ -241,8 +241,9 @@ class PyTpuBuffer {
 // `child_buffers_` stores the child buffers; else, `device_buffer_` stores
 // the data content and `child_buffers_` is empty.
 mutable absl::Mutex mu_;
-std::shared_ptr<TpuSharedBuffer> device_buffer_ GUARDED_BY(mu_);
-std::vector<std::shared_ptr<TpuSharedBuffer>> child_buffers_ GUARDED_BY(mu_);
+std::shared_ptr<TpuSharedBuffer> device_buffer_ TF_GUARDED_BY(mu_);
+std::vector<std::shared_ptr<TpuSharedBuffer>> child_buffers_
+TF_GUARDED_BY(mu_);
 // The cached value of the buffer on the host, produced either from a call to
 // CopyToHost or from a call to ToLiteral. Once a value has been fetched to
 // the host, it persists Delete() is called or the PyTpuBuffer is destroyed.
@@ -255,7 +256,7 @@ class PyTpuBuffer {
 Status status;
 std::shared_ptr<Literal> value;
 };
-std::shared_ptr<HostValue> host_value_ GUARDED_BY(mu_);
+std::shared_ptr<HostValue> host_value_ TF_GUARDED_BY(mu_);
 };
 // Represents a compiled computation that can be executed given handles to
// Represents a compiled computation that can be executed given handles to // Represents a compiled computation that can be executed given handles to

View File

@@ -23,7 +23,6 @@
 #include <string>
 #include <vector>
-#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/synchronization/mutex.h"

View File

@@ -40,11 +40,11 @@ class WorkerThread {
 void Schedule(std::function<void()> fn);
 private:
-bool WorkAvailable() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+bool WorkAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 void WorkLoop();
 absl::Mutex mu_;
-std::queue<std::function<void()>> work_queue_ GUARDED_BY(mu_);
+std::queue<std::function<void()>> work_queue_ TF_GUARDED_BY(mu_);
 std::unique_ptr<tensorflow::Thread> thread_;
 };

View File

@@ -73,7 +73,7 @@ namespace {
 struct Uniquer {
 absl::Mutex mu;
-NameUniquer name_uniquer GUARDED_BY(mu);
+NameUniquer name_uniquer TF_GUARDED_BY(mu);
 };
 Uniquer* GetUniquer() {

View File

@@ -87,7 +87,7 @@ class AllocationTracker {
 // Internal helper which resolves the given GlobalDataHandle to a
 // list of ScopedShapedBuffers.
 StatusOr<std::vector<const ShapedBuffer*>> ResolveInternal(
-const GlobalDataHandle& data) const EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+const GlobalDataHandle& data) const TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 // Internal helper which registers a vector of shaped buffers, one per
 // replica. ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer. If
@@ -96,18 +96,19 @@ class AllocationTracker {
 template <typename ShapedBufferTy>
 StatusOr<GlobalDataHandle> RegisterInternal(
 std::vector<ShapedBufferTy> replicated_buffers, const string& tag)
-EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 // Adds the given device address to the allocation tracker, or if it already
 // exists, then increment its reference count.
 void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
 int device_ordinal)
-EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 // Decrements the reference count of the given device memory. Then, if it is
 // zero, deallocate the memory.
 Status DecrementRefCount(se::DeviceMemoryBase device_memory,
-int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+int device_ordinal)
+TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 // A map from device memory opaque value to allocation. One such map is
 // maintained per device ordinal.
@@ -121,11 +122,11 @@ class AllocationTracker {
 // The next handle to assign to an allocation, guarded by the same mutex as
 // the mapping as they'll be mutated at the same time.
-int64 next_handle_ GUARDED_BY(mutex_);
+int64 next_handle_ TF_GUARDED_BY(mutex_);
 // A map from device ordinal to AllocationMap.
 absl::flat_hash_map<int, AllocationMap> opaque_to_allocation_map_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 // A map from data handle to a vector of shaped buffers that represent the
 // buffers for different replicas.
@@ -145,7 +146,7 @@ class AllocationTracker {
 // free'd when both the view *and* the original tuple are Unregistered. This
 // refcounting is managed in opaque_to_allocation_map_.
 absl::flat_hash_map<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
-handle_to_shaped_buffers_ GUARDED_BY(mutex_);
+handle_to_shaped_buffers_ TF_GUARDED_BY(mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker);
 };

View File

@@ -176,7 +176,7 @@ class Backend {
 // Mapping from stream executor to stream pools, used by `BorrowStream` above.
 absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamPool>>
-stream_pools_ GUARDED_BY(mu_);
+stream_pools_ TF_GUARDED_BY(mu_);
 // The default memory allocator to use.
 std::unique_ptr<se::StreamExecutorMemoryAllocator> memory_allocator_;

View File

@@ -68,24 +68,24 @@ class ChannelTracker {
 // Bumps the next_channel_ number and returns the allocated number
 // wrapped in a ChannelHandle.
 ChannelHandle AllocateHandle(ChannelHandle::ChannelType type)
-EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
 Status RegisterSendInternal(const ChannelHandle& handle)
-EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
 Status RegisterRecvInternal(const ChannelHandle& handle)
-EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
+TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_);
 // Guards the channel mapping.
 tensorflow::mutex channel_mutex_;
 // The next sequence number to assign to a channel.
-int64 next_channel_ GUARDED_BY(channel_mutex_);
+int64 next_channel_ TF_GUARDED_BY(channel_mutex_);
 // Mapping from ChannelHandle value to the corresponding registered
 // Channel object.
 absl::flat_hash_map<int64, Channel> opaque_to_channel_
-GUARDED_BY(channel_mutex_);
+TF_GUARDED_BY(channel_mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(ChannelTracker);
 };

View File

@@ -241,9 +241,9 @@ class Rendezvous {
 tensorflow::mutex mu_;
-bool initialized_ GUARDED_BY(mu_) = false;
-std::vector<AllReduceParticipantData> participants_ GUARDED_BY(mu_);
+bool initialized_ TF_GUARDED_BY(mu_) = false;
+std::vector<AllReduceParticipantData> participants_ TF_GUARDED_BY(mu_);
 private:
 // Runs the all-reduce on the given thread. If successful, returns
View File

@@ -51,7 +51,7 @@ class CompilationCache {
 using CacheKey = int64;
 absl::flat_hash_map<CacheKey, std::shared_ptr<Executable>> cache_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 private:
 TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache);

View File

@@ -23,7 +23,7 @@ namespace orc_jit_memory_mapper {
 static tensorflow::mutex mapper_instance_mutex(tensorflow::LINKER_INITIALIZED);
 static llvm::SectionMemoryManager::MemoryMapper* mapper_instance
-GUARDED_BY(mapper_instance_mutex) = nullptr;
+TF_GUARDED_BY(mapper_instance_mutex) = nullptr;
 llvm::SectionMemoryManager::MemoryMapper* GetInstance() {
 tensorflow::mutex_lock lock(mapper_instance_mutex);

View File

@@ -274,7 +274,7 @@ static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
 // dies. But we only add an entry if dumping is enabled for this module, and
 // dumping a module leaks buffer space in stdout or bytes on disk *way* faster
 // than this hashtable leaks memory.
-static auto& module_id_to_step_number GUARDED_BY(mu) =
+static auto& module_id_to_step_number TF_GUARDED_BY(mu) =
 *new absl::flat_hash_map<int64, int64>();
 // Maps a module's unique ID to a timestamp indicating when we've first dumped
@@ -285,7 +285,7 @@ static auto& module_id_to_step_number GUARDED_BY(mu) =
 // dies. But we only add an entry if dumping is enabled for this module, and
 // dumping a module leaks buffer space in stdout or bytes on disk *way* faster
 // than this hashtable leaks memory.
-static auto& module_id_to_timestamp GUARDED_BY(mu) =
+static auto& module_id_to_timestamp TF_GUARDED_BY(mu) =
 *new absl::flat_hash_map<int64, uint64>();
 int64 StepNumberForModule(const HloModule& module) {
@@ -432,7 +432,7 @@ void DumpHloSnapshotIfEnabled(const HloModule& module,
 int64 execution_count;
 uint64 timestamp;
 {
-static auto& module_id_to_execution_count GUARDED_BY(mu) =
+static auto& module_id_to_execution_count TF_GUARDED_BY(mu) =
 *new absl::flat_hash_map<int64, int64>();
 tensorflow::mutex_lock lock(mu);
 execution_count = module_id_to_execution_count[module.unique_id()]++;
@@ -469,7 +469,7 @@ void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
 // have to use its name.
 int64 execution_count;
 {
-static auto& module_name_to_execution_count GUARDED_BY(mu) =
+static auto& module_name_to_execution_count TF_GUARDED_BY(mu) =
 *new absl::flat_hash_map<string, int64>();
 tensorflow::mutex_lock lock(mu);
 execution_count = module_name_to_execution_count[name]++;

View File

@@ -86,12 +86,12 @@ class ExecutionTracker {
 private:
 // The next handle to assign to an execution.
-int64 next_handle_ GUARDED_BY(execution_mutex_);
+int64 next_handle_ TF_GUARDED_BY(execution_mutex_);
 // Mapping from ExecutionHandle handle to the corresponding registered
 // AsyncExecution object.
 std::map<int64, std::unique_ptr<AsyncExecution>> handle_to_execution_
-GUARDED_BY(execution_mutex_);
+TF_GUARDED_BY(execution_mutex_);
 tensorflow::mutex execution_mutex_; // Guards the execution mapping.

View File

@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_
-#include "absl/base/thread_annotations.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/blas.h"
 namespace xla {
@@ -66,7 +66,8 @@ class CholeskyThunk : public Thunk {
 const int64 n_;
 tensorflow::mutex mu_;
-absl::flat_hash_map<se::Stream*, CusolverContext> contexts_ GUARDED_BY(mu_);
+absl::flat_hash_map<se::Stream*, CusolverContext> contexts_
+TF_GUARDED_BY(mu_);
 };
 } // namespace gpu

View File

@@ -127,12 +127,12 @@ class Rendezvous {
 std::make_shared<BlockingCounter>(key_.num_participants)};
 tensorflow::mutex mu_;
-bool initialized_ GUARDED_BY(mu_) = false;
+bool initialized_ TF_GUARDED_BY(mu_) = false;
 // We use an std::map so that we can iterate over it below in a guaranteed
 // order. The order shouldn't actually matter, but why be nondeterministic if
 // we don't have to be?
-std::map<int64, ParticipantData> participants_ GUARDED_BY(mu_);
+std::map<int64, ParticipantData> participants_ TF_GUARDED_BY(mu_);
 };
 void EnqueueCopy(se::DeviceMemoryBase src, se::Stream* src_stream,

View File

@@ -45,11 +45,11 @@ using GemmCacheKey =
 std::tuple<se::StreamExecutor*, Shape, Shape, Shape, std::string>;
 static tensorflow::mutex autotune_cache_mu(tensorflow::LINKER_INITIALIZED);
-static auto& autotune_cache GUARDED_BY(autotune_cache_mu) =
+static auto& autotune_cache TF_GUARDED_BY(autotune_cache_mu) =
 *new absl::flat_hash_map<GemmCacheKey,
 absl::optional<se::blas::AlgorithmType>>();
-static int64 cache_hits GUARDED_BY(autotune_cache_mu) = 0;
-static int64 cache_misses GUARDED_BY(autotune_cache_mu) = 0;
+static int64 cache_hits TF_GUARDED_BY(autotune_cache_mu) = 0;
+static int64 cache_misses TF_GUARDED_BY(autotune_cache_mu) = 0;
 // Experimentally tries to pick the best algorithm for the given gemm.
 //

View File

@@ -268,9 +268,9 @@ ConvCacheKey AutotuneCacheKeyfromInstruction(
 }
 tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED);
-auto& autotune_cache GUARDED_BY(autotune_cache_lock) =
+auto& autotune_cache TF_GUARDED_BY(autotune_cache_lock) =
 *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>();
-auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) =
+auto& autotune_cache_stats TF_GUARDED_BY(autotune_cache_lock) =
 *new ConvCacheStats();
 } // anonymous namespace

View File

@@ -126,17 +126,17 @@ class GpuDebugInfoManager {
 };
 tensorflow::mutex mutex_;
-bool tracing_active_ GUARDED_BY(mutex_) = false;
+bool tracing_active_ TF_GUARDED_BY(mutex_) = false;
 // Modules that was running currently. Because multiple instances of the
 // modules can be running in the same time, a reference count is maintained
 // as map value.
 absl::flat_hash_map<ModuleIdentifier, int> running_module_ids_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 // Active modules are those still tracked by us. There could be much more
 // active modules than running modules, we will try to reduce the trace size
 // by only transfer those modules that were running during tracing period.
 absl::flat_hash_map<ModuleIdentifier, GpuModuleEntry> active_modules_
-GUARDED_BY(mutex_);
+TF_GUARDED_BY(mutex_);
 };
} // namespace gpu } // namespace gpu

View File

@@ -159,9 +159,9 @@ class GpuExecutable : public Executable {
 // `ResolveConstantGlobals`.
 tensorflow::mutex module_handle_mutex_;
 std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
-module_handles_ GUARDED_BY(module_handle_mutex_);
+module_handles_ TF_GUARDED_BY(module_handle_mutex_);
 std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
-module_globals_ GUARDED_BY(module_handle_mutex_);
+module_globals_ TF_GUARDED_BY(module_handle_mutex_);
 TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
 };

View File

@@ -84,7 +84,7 @@ class KernelThunk : public Thunk {
 // Loaded kernels for each `StreamExecutor`. Requires pointer stability of
 // values.
 std::unordered_map<se::StreamExecutor*, std::unique_ptr<se::KernelBase>>
-kernel_cache_ GUARDED_BY(mutex_);
+kernel_cache_ TF_GUARDED_BY(mutex_);
 };
 } // namespace gpu

View File

@@ -492,7 +492,7 @@ void RendezvousNcclAllReduce::CleanupImpl(std::shared_ptr<NcclClique> handle,
 // lives, which is how we avoid expensive reinitialization of NCCL cliques.
 struct NcclAllReduceThunk::AuxData {
 tensorflow::mutex mu;
-absl::flat_hash_set<std::shared_ptr<NcclClique>> cliques GUARDED_BY(mu);
+absl::flat_hash_set<std::shared_ptr<NcclClique>> cliques TF_GUARDED_BY(mu);
 };
 /*static*/ bool NcclAllReduceThunk::CanImplement(const HloInstruction* crs) {


@ -62,8 +62,8 @@ class NVPTXCompiler : public GpuCompiler {
// We cache the cuda_data_dir() and the result of our search, so that if the // We cache the cuda_data_dir() and the result of our search, so that if the
// next module we have to compile has the same cuda_data_dir(), we can skip // next module we have to compile has the same cuda_data_dir(), we can skip
// the search. // the search.
string cached_cuda_data_dir_ GUARDED_BY(mutex_); string cached_cuda_data_dir_ TF_GUARDED_BY(mutex_);
string cached_libdevice_dir_ GUARDED_BY(mutex_); string cached_libdevice_dir_ TF_GUARDED_BY(mutex_);
// Tries to compile the given ptx string to cubin. Returns a vector with the // Tries to compile the given ptx string to cubin. Returns a vector with the
// compiled cubin. If compilation was unsuccessful, returns an empty vector. // compiled cubin. If compilation was unsuccessful, returns an empty vector.
@ -116,7 +116,7 @@ class NVPTXCompiler : public GpuCompiler {
// is critical here. // is critical here.
absl::node_hash_map<CompilationCacheKey, CompilationCacheValue, absl::node_hash_map<CompilationCacheKey, CompilationCacheValue,
CompilationCacheHash, CompilationCacheEq> CompilationCacheHash, CompilationCacheEq>
compilation_cache_ GUARDED_BY(mutex_); compilation_cache_ TF_GUARDED_BY(mutex_);
TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler); TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler);
}; };


@ -1557,7 +1557,7 @@ string WrapDotInHtml(absl::string_view dot) {
tensorflow::mutex url_renderer_mu(tensorflow::LINKER_INITIALIZED); tensorflow::mutex url_renderer_mu(tensorflow::LINKER_INITIALIZED);
std::function<StatusOr<string>(absl::string_view)>* url_renderer std::function<StatusOr<string>(absl::string_view)>* url_renderer
GUARDED_BY(url_renderer_mu) = nullptr; TF_GUARDED_BY(url_renderer_mu) = nullptr;
// Precondition: url_renderer != nullptr. // Precondition: url_renderer != nullptr.
// //
@ -1567,7 +1567,7 @@ std::function<StatusOr<string>(absl::string_view)>* url_renderer
// of producing dot for the graph.) // of producing dot for the graph.)
StatusOr<string> WrapDotInFormat(absl::string_view dot, StatusOr<string> WrapDotInFormat(absl::string_view dot,
RenderedGraphFormat format) RenderedGraphFormat format)
EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) { TF_EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) {
switch (format) { switch (format) {
case RenderedGraphFormat::kUrl: case RenderedGraphFormat::kUrl:
CHECK(url_renderer != nullptr) CHECK(url_renderer != nullptr)


@ -52,13 +52,13 @@ class InterpreterExecutable : public Executable {
const ServiceExecutableRunOptions* run_options, const ServiceExecutableRunOptions* run_options,
std::vector<ExecutionInput> arguments, std::vector<ExecutionInput> arguments,
HloExecutionProfile* hlo_execution_profile) override HloExecutionProfile* hlo_execution_profile) override
LOCKS_EXCLUDED(evaluator_lock_); TF_LOCKS_EXCLUDED(evaluator_lock_);
static int64 ShapeSizeBytes(const Shape& shape); static int64 ShapeSizeBytes(const Shape& shape);
protected: protected:
// The interpreter interprets executables with an HloEvaluator. // The interpreter interprets executables with an HloEvaluator.
std::unique_ptr<HloEvaluator> evaluator_ PT_GUARDED_BY(evaluator_lock_); std::unique_ptr<HloEvaluator> evaluator_ TF_PT_GUARDED_BY(evaluator_lock_);
mutable tensorflow::mutex evaluator_lock_; mutable tensorflow::mutex evaluator_lock_;
private: private:
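
Worth noting in the hunk above is the difference between TF_GUARDED_BY and TF_PT_GUARDED_BY: evaluator_ is a smart pointer, and only what it points to is protected by evaluator_lock_, so the pointer itself may be inspected without the lock while dereferencing it requires the lock. A small, self-contained illustration (the Counter class and its members are invented for the example):

#include <memory>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class Counter {
 public:
  int Next() TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    // OK to dereference impl_ here: mu_ is held, satisfying PT_GUARDED_BY.
    return ++(*impl_);
  }

  bool has_impl() const {
    // Reading the pointer itself needs no lock; only the pointee is guarded.
    return impl_ != nullptr;
  }

 private:
  mutable tensorflow::mutex mu_;
  // The int that impl_ points to is guarded by mu_; the unique_ptr is not.
  std::unique_ptr<int> impl_ TF_PT_GUARDED_BY(mu_) = std::make_unique<int>(0);
};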


@ -56,7 +56,7 @@ class StreamPool {
void ReturnStream(se::Stream* stream); void ReturnStream(se::Stream* stream);
tensorflow::mutex mu_; tensorflow::mutex mu_;
std::vector<std::unique_ptr<se::Stream>> streams_ GUARDED_BY(mu_); std::vector<std::unique_ptr<se::Stream>> streams_ TF_GUARDED_BY(mu_);
}; };
} // namespace xla } // namespace xla


@ -66,12 +66,12 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator {
mutable tensorflow::mutex count_mutex_; mutable tensorflow::mutex count_mutex_;
// Global counts of allocations and deallocations. // Global counts of allocations and deallocations.
int64 allocation_count_ GUARDED_BY(count_mutex_) = 0; int64 allocation_count_ TF_GUARDED_BY(count_mutex_) = 0;
int64 deallocation_count_ GUARDED_BY(count_mutex_) = 0; int64 deallocation_count_ TF_GUARDED_BY(count_mutex_) = 0;
// Per-device counts of allocations and deallocations. // Per-device counts of allocations and deallocations.
std::map<int, int64> device_allocation_count_ GUARDED_BY(count_mutex_); std::map<int, int64> device_allocation_count_ TF_GUARDED_BY(count_mutex_);
std::map<int, int64> device_deallocation_count_ GUARDED_BY(count_mutex_); std::map<int, int64> device_deallocation_count_ TF_GUARDED_BY(count_mutex_);
}; };
// A base class for tests which exercise the LocalClient interface. // A base class for tests which exercise the LocalClient interface.


@ -173,11 +173,11 @@ class XRTCompilationCache : public ResourceBase {
// last reference to entry is released, entry is removed from cache_. // last reference to entry is released, entry is removed from cache_.
void DiscardEntryRef(CompiledSubgraph* entry); void DiscardEntryRef(CompiledSubgraph* entry);
void DiscardEntryRefLocked(CompiledSubgraph* entry) void DiscardEntryRefLocked(CompiledSubgraph* entry)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// Marks the oldest unmarked entry for eviction. Requires that there is at // Marks the oldest unmarked entry for eviction. Requires that there is at
// least one such entry. // least one such entry.
void MarkOldestEntryForEviction() EXCLUSIVE_LOCKS_REQUIRED(mu_); void MarkOldestEntryForEviction() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// Updates datastructures to indicate that entry, which had been marked for // Updates datastructures to indicate that entry, which had been marked for
// eviction, has been looked up. This is called by CompileIfKeyAbsent when an // eviction, has been looked up. This is called by CompileIfKeyAbsent when an
@ -195,7 +195,7 @@ class XRTCompilationCache : public ResourceBase {
// is never marked for eviction, so an entry larger than the max cache entries // is never marked for eviction, so an entry larger than the max cache entries
// will remain in the cache until it is replaced by something else. // will remain in the cache until it is replaced by something else.
void LookupEntryMarkedForEviction(CompiledSubgraph* entry) void LookupEntryMarkedForEviction(CompiledSubgraph* entry)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// Creates a new entry by running initialize_program and places it in the // Creates a new entry by running initialize_program and places it in the
// cache to be looked up by key. The new entry is in the 'marked for eviction' // cache to be looked up by key. The new entry is in the 'marked for eviction'
@ -206,7 +206,7 @@ class XRTCompilationCache : public ResourceBase {
CompiledSubgraph* InitializeEntry( CompiledSubgraph* InitializeEntry(
const string& key, const string& key,
const std::function<Status(std::unique_ptr<xla::LocalExecutable>*)>& const std::function<Status(std::unique_ptr<xla::LocalExecutable>*)>&
initialize_program) EXCLUSIVE_LOCKS_REQUIRED(mu_); initialize_program) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// The maximum number of entries that are stored in the cache before entries // The maximum number of entries that are stored in the cache before entries
// are marked for eviction. // are marked for eviction.
@ -214,23 +214,24 @@ class XRTCompilationCache : public ResourceBase {
mutable absl::Mutex mu_; mutable absl::Mutex mu_;
// The total number of entries that are stored and not marked for eviction. // The total number of entries that are stored and not marked for eviction.
int cache_entries_ GUARDED_BY(mu_) = 0; int cache_entries_ TF_GUARDED_BY(mu_) = 0;
// The total number of entries that are marked for eviction. // The total number of entries that are marked for eviction.
int marked_for_eviction_entries_ GUARDED_BY(mu_) = 0; int marked_for_eviction_entries_ TF_GUARDED_BY(mu_) = 0;
// The value to assign to the last_use field of the next entry that is looked // The value to assign to the last_use field of the next entry that is looked
// up. // up.
int64 use_counter_ GUARDED_BY(mu_) = 0; int64 use_counter_ TF_GUARDED_BY(mu_) = 0;
// All the executables that can be looked up in the cache index by key. An // All the executables that can be looked up in the cache index by key. An
// entry is marked for eviction iff it is present in cache_ and not in // entry is marked for eviction iff it is present in cache_ and not in
// entries_by_last_use_. // entries_by_last_use_.
std::unordered_map<string, CompiledSubgraph*> cache_ GUARDED_BY(mu_); std::unordered_map<string, CompiledSubgraph*> cache_ TF_GUARDED_BY(mu_);
// All the executable entries that can be looked up in the cache indexed by // All the executable entries that can be looked up in the cache indexed by
// uid. // uid.
std::unordered_map<int64, CompiledSubgraph*> entries_by_uid_ GUARDED_BY(mu_); std::unordered_map<int64, CompiledSubgraph*> entries_by_uid_
TF_GUARDED_BY(mu_);
// Map from last_use to entry, used to mark entries for eviction in LRU // Map from last_use to entry, used to mark entries for eviction in LRU
// order. If an entry's last_use counter is not present as a key in // order. If an entry's last_use counter is not present as a key in
// entries_by_last_use_ then the entry has been marked for eviction. // entries_by_last_use_ then the entry has been marked for eviction.
std::map<int64, CompiledSubgraph*> entries_by_last_use_ GUARDED_BY(mu_); std::map<int64, CompiledSubgraph*> entries_by_last_use_ TF_GUARDED_BY(mu_);
}; };
// Looks up or create an XRTCompilationCache object within the given resource // Looks up or create an XRTCompilationCache object within the given resource
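
The comments above describe the cache's LRU bookkeeping: a monotonically increasing use counter, a map keyed by that counter so the oldest entry is always first, and "marked for eviction" encoded as absence from that map. A heavily stripped-down sketch of just that bookkeeping — LruMarker and its members are invented here, and the real XRTCompilationCache additionally manages reference counts, uids, and the compiled executables themselves:

#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class LruMarker {
 public:
  // Records a use of `key`, moving it to the back of the LRU order.
  void Touch(const std::string& key) TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    auto it = last_use_.find(key);
    if (it != last_use_.end()) by_last_use_.erase(it->second);
    int64_t use = use_counter_++;
    last_use_[key] = use;
    by_last_use_[use] = key;  // std::map keeps keys ordered, oldest first.
  }

  // Marks the least recently used, not-yet-marked entry for eviction.
  // Requires at least one such entry, mirroring MarkOldestEntryForEviction.
  std::string MarkOldestForEviction() TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    assert(!by_last_use_.empty());
    auto oldest = by_last_use_.begin();
    std::string key = oldest->second;
    by_last_use_.erase(oldest);  // Absence from by_last_use_ == "marked".
    last_use_.erase(key);
    return key;
  }

 private:
  tensorflow::mutex mu_;
  int64_t use_counter_ TF_GUARDED_BY(mu_) = 0;
  std::unordered_map<std::string, int64_t> last_use_ TF_GUARDED_BY(mu_);
  std::map<int64_t, std::string> by_last_use_ TF_GUARDED_BY(mu_);
};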


@ -165,14 +165,14 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
condition_variable launch_cv_; condition_variable launch_cv_;
// collective instance key -> number of local devices for which NCCL ops have // collective instance key -> number of local devices for which NCCL ops have
// been launched. // been launched.
std::unordered_map<int32, int32> launched_ GUARDED_BY(launch_mu_); std::unordered_map<int32, int32> launched_ TF_GUARDED_BY(launch_mu_);
private: private:
Status CreateCollective(const CollectiveParams& col_params, Status CreateCollective(const CollectiveParams& col_params,
CollectiveImplementationInterface** col_impl); CollectiveImplementationInterface** col_impl);
// Check if all ops on which this collective depends on have launched. // Check if all ops on which this collective depends on have launched.
bool CheckDependencies(const CollectiveParams& col_params) bool CheckDependencies(const CollectiveParams& col_params)
EXCLUSIVE_LOCKS_REQUIRED(launch_mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(launch_mu_);
}; };
} // namespace tensorflow } // namespace tensorflow


@ -269,7 +269,7 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) {
} }
bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_) { TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
// Do nothing if garbage collection is off. // Do nothing if garbage collection is off.
if (!garbage_collection_) { if (!garbage_collection_) {
return false; return false;
@ -326,7 +326,7 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
void BFCAllocator::DeallocateRegions( void BFCAllocator::DeallocateRegions(
const absl::flat_hash_set<void*>& region_ptrs) const absl::flat_hash_set<void*>& region_ptrs)
EXCLUSIVE_LOCKS_REQUIRED(lock_) { TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
// Explicitly remove the const qualifier as some compilers disallow passing // Explicitly remove the const qualifier as some compilers disallow passing
// const_iterator to std::vector::erase(), which is used in // const_iterator to std::vector::erase(), which is used in
// RemoveAllocationRegion(). // RemoveAllocationRegion().
@ -450,7 +450,7 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
#endif #endif
tensorflow::profiler::TraceMe trace_me( tensorflow::profiler::TraceMe trace_me(
[&]() EXCLUSIVE_LOCKS_REQUIRED(lock_) { [&]() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
AllocatorStats stats = stats_; AllocatorStats stats = stats_;
int64 bytes_available = int64 bytes_available =
memory_limit_ - stats.bytes_reserved - stats.bytes_in_use; memory_limit_ - stats.bytes_reserved - stats.bytes_in_use;
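
The lambda passed to TraceMe above is itself annotated with TF_EXCLUSIVE_LOCKS_REQUIRED(lock_): that is how the analysis learns the callback only runs while the enclosing function already holds the allocator lock, so its reads of stats_ are not reported as unguarded. The same trick in isolation (Stats, Update, and total_ are illustrative names, not BFCAllocator's API):

#include <iostream>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class Stats {
 public:
  void Update(int delta) TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    total_ += delta;
    // The lambda reads total_, which is guarded by mu_. Annotating it with
    // TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) tells the analysis it only runs here,
    // while mu_ is held, so the access is not flagged as unguarded.
    auto log = [this]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
      std::cout << "total=" << total_ << "\n";
    };
    log();
  }

 private:
  tensorflow::mutex mu_;
  int total_ TF_GUARDED_BY(mu_) = 0;
};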


@ -113,12 +113,12 @@ class BFCAllocator : public Allocator {
// constituents so they're only useful for allocations not requiring a // constituents so they're only useful for allocations not requiring a
// particular timestamp. // particular timestamp.
bool MergeTimestampedChunks(size_t required_bytes) bool MergeTimestampedChunks(size_t required_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Add TraceMe (in memory allocation and deallocation) for memory stats // Add TraceMe (in memory allocation and deallocation) for memory stats
// profiling. The requested_bytes can be negative if it's a deallocation. // profiling. The requested_bytes can be negative if it's a deallocation.
void AddTraceMe(absl::string_view traceme_name, int64 requested_bytes) void AddTraceMe(absl::string_view traceme_name, int64 requested_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
// kInvalidChunkHandle means an invalid chunk // kInvalidChunkHandle means an invalid chunk
@ -186,7 +186,7 @@ class BFCAllocator : public Allocator {
#endif #endif
string DebugString(BFCAllocator* a, string DebugString(BFCAllocator* a,
bool recurse) NO_THREAD_SAFETY_ANALYSIS { bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
string dbg; string dbg;
strings::StrAppend( strings::StrAppend(
&dbg, " Size: ", strings::HumanReadableNumBytes(size), &dbg, " Size: ", strings::HumanReadableNumBytes(size),
@ -221,7 +221,7 @@ class BFCAllocator : public Allocator {
: allocator_(allocator) {} : allocator_(allocator) {}
// Sort first by size and then use pointer address as a tie breaker. // Sort first by size and then use pointer address as a tie breaker.
bool operator()(const ChunkHandle ha, bool operator()(const ChunkHandle ha,
const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS { const ChunkHandle hb) const TF_NO_THREAD_SAFETY_ANALYSIS {
const Chunk* a = allocator_->ChunkFromHandle(ha); const Chunk* a = allocator_->ChunkFromHandle(ha);
const Chunk* b = allocator_->ChunkFromHandle(hb); const Chunk* b = allocator_->ChunkFromHandle(hb);
if (a->size != b->size) { if (a->size != b->size) {
@ -382,7 +382,7 @@ class BFCAllocator : public Allocator {
// 'rounded_bytes' bytes. Returns true on success and false on // 'rounded_bytes' bytes. Returns true on success and false on
// failure. // failure.
bool Extend(size_t alignment, size_t rounded_bytes) bool Extend(size_t alignment, size_t rounded_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Deallocate free regions to give back the memory to suballocator, so that // Deallocate free regions to give back the memory to suballocator, so that
// we can re-allocate a larger region. The main use scenario of this function // we can re-allocate a larger region. The main use scenario of this function
@ -394,58 +394,58 @@ class BFCAllocator : public Allocator {
// Helper function to deallocate regions. // Helper function to deallocate regions.
void DeallocateRegions(const absl::flat_hash_set<void*>& region_ptrs) void DeallocateRegions(const absl::flat_hash_set<void*>& region_ptrs)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Returns a pointer to an underlying allocated chunk of size // Returns a pointer to an underlying allocated chunk of size
// 'rounded_bytes'. // 'rounded_bytes'.
void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_); uint64 freed_before) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Splits the chunk specified by 'h' into two chunks, one at least // Splits the chunk specified by 'h' into two chunks, one at least
// of size 'num_bytes'. // of size 'num_bytes'.
void SplitChunk(ChunkHandle h, size_t num_bytes) void SplitChunk(ChunkHandle h, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Merges the two chunk handles. Requires that the chunks are // Merges the two chunk handles. Requires that the chunks are
// contiguous in their allocation. // contiguous in their allocation.
void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_); void Merge(ChunkHandle h, ChunkHandle h2) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Adds the chunk 'h' to the proper free bin. // Adds the chunk 'h' to the proper free bin.
void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); void InsertFreeChunkIntoBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the free chunk pointed to by 'c' from the set free_chunks. // Removes the free chunk pointed to by 'c' from the set free_chunks.
void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks, void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
const Bin::FreeChunkSet::iterator& c) const Bin::FreeChunkSet::iterator& c)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes a free chunk from the bin. // Removes a free chunk from the bin.
void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); void RemoveFreeChunkFromBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
void MaybeRemoveFreeChunkFromBin(ChunkHandle h) void MaybeRemoveFreeChunkFromBin(ChunkHandle h)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the chunk metadata represented by 'h'. // Removes the chunk metadata represented by 'h'.
void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); void DeleteChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_); string RenderOccupancy() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); void DumpMemoryLog(size_t num_bytes) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
MemoryDump RecordMemoryMapInternal() EXCLUSIVE_LOCKS_REQUIRED(lock_); MemoryDump RecordMemoryMapInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
void MaybeWriteMemoryMap() EXCLUSIVE_LOCKS_REQUIRED(lock_); void MaybeWriteMemoryMap() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_); ChunkHandle AllocateChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); void DeallocateChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); Chunk* ChunkFromHandle(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
const Chunk* ChunkFromHandle(ChunkHandle h) const const Chunk* ChunkFromHandle(ChunkHandle h) const
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
void MarkFree(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); void MarkFree(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at) ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Fragmentation is calculated as the reverse ratio of the largest free chunk // Fragmentation is calculated as the reverse ratio of the largest free chunk
// size over total free memory, and returns a value within [0, 1]. // size over total free memory, and returns a value within [0, 1].
double GetFragmentation() EXCLUSIVE_LOCKS_REQUIRED(lock_); double GetFragmentation() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Information about a Bin that is useful for debugging. // Information about a Bin that is useful for debugging.
struct BinDebugInfo { struct BinDebugInfo {
@ -458,7 +458,7 @@ class BFCAllocator : public Allocator {
// Computes and returns a BinDebugInfo for each Bin. // Computes and returns a BinDebugInfo for each Bin.
std::array<BinDebugInfo, kNumBins> get_bin_debug_info() std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
EXCLUSIVE_LOCKS_REQUIRED(lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
AllocatorRetry retry_helper_; AllocatorRetry retry_helper_;
@ -526,21 +526,21 @@ class BFCAllocator : public Allocator {
// Structures mutable after construction // Structures mutable after construction
mutable mutex lock_; mutable mutex lock_;
RegionManager region_manager_ GUARDED_BY(lock_); RegionManager region_manager_ TF_GUARDED_BY(lock_);
std::vector<Chunk> chunks_ GUARDED_BY(lock_); std::vector<Chunk> chunks_ TF_GUARDED_BY(lock_);
// Pointer to head of linked list of free Chunks // Pointer to head of linked list of free Chunks
ChunkHandle free_chunks_list_ GUARDED_BY(lock_); ChunkHandle free_chunks_list_ TF_GUARDED_BY(lock_);
// Counter containing the next unique identifier to assign to a // Counter containing the next unique identifier to assign to a
// newly-created chunk. // newly-created chunk.
int64 next_allocation_id_ GUARDED_BY(lock_); int64 next_allocation_id_ TF_GUARDED_BY(lock_);
// Stats. // Stats.
AllocatorStats stats_ GUARDED_BY(lock_); AllocatorStats stats_ TF_GUARDED_BY(lock_);
#ifdef TENSORFLOW_MEM_DEBUG #ifdef TENSORFLOW_MEM_DEBUG
int64 action_counter_ GUARDED_BY(lock_); int64 action_counter_ TF_GUARDED_BY(lock_);
#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096 #define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE]; int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
#endif #endif
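
The GetFragmentation comment above fixes the metric precisely: one minus the ratio of the largest free chunk to total free memory, yielding a value in [0, 1] (0 when all free memory is a single contiguous chunk). Reduced to plain numbers rather than the allocator's chunk structures, the arithmetic is simply:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// fragmentation = 1 - largest_free_chunk / total_free_bytes, in [0, 1].
double Fragmentation(const std::vector<int64_t>& free_chunk_sizes) {
  int64_t total = 0;
  int64_t largest = 0;
  for (int64_t size : free_chunk_sizes) {
    total += size;
    largest = std::max(largest, size);
  }
  if (total == 0) return 0.0;  // No free memory: nothing to fragment.
  return 1.0 - static_cast<double>(largest) / static_cast<double>(total);
}

int main() {
  // One 256 MiB free chunk: no fragmentation.
  assert(Fragmentation({256 << 20}) == 0.0);
  // Four equal free chunks: 1 - 1/4 = 0.75.
  assert(Fragmentation({64, 64, 64, 64}) == 0.75);
  return 0;
}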


@ -103,9 +103,9 @@ class BufRendezvous {
const uint64 step_id_; const uint64 step_id_;
const DeviceMgr* const dev_mgr_; // Not owned. const DeviceMgr* const dev_mgr_; // Not owned.
mutex mu_; mutex mu_;
Status status_ GUARDED_BY(mu_); Status status_ TF_GUARDED_BY(mu_);
typedef absl::flat_hash_map<string, Hook*> HookTable; typedef absl::flat_hash_map<string, Hook*> HookTable;
HookTable hook_table_ GUARDED_BY(mu_); HookTable hook_table_ TF_GUARDED_BY(mu_);
void PurgeTable(const Status& s, HookTable* table); void PurgeTable(const Status& s, HookTable* table);
}; };


@ -72,7 +72,8 @@ class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
private: private:
mutex exec_mu_; mutex exec_mu_;
// Map from step_id to CollectiveExecutor // Map from step_id to CollectiveExecutor
gtl::FlatMap<int64, CollectiveExecutor*> executor_table_ GUARDED_BY(exec_mu_); gtl::FlatMap<int64, CollectiveExecutor*> executor_table_
TF_GUARDED_BY(exec_mu_);
}; };
} // namespace tensorflow } // namespace tensorflow


@ -507,7 +507,7 @@ void CollectiveParamResolverLocal::InitInstanceSharedParams(
ir->shared.instance.task_names, // NOLINT ir->shared.instance.task_names, // NOLINT
attributes, attributes,
[this, gr, cp, ir, attributes, done](const Status& s) [this, gr, cp, ir, attributes, done](const Status& s)
EXCLUSIVE_LOCK_FUNCTION(ir->out_mu) { TF_EXCLUSIVE_LOCK_FUNCTION(ir->out_mu) {
// Then we recover the lock in the callback thread that will hold it // Then we recover the lock in the callback thread that will hold it
// through the rest of the call chain. Signal the cv now, any // through the rest of the call chain. Signal the cv now, any
// waiting threads will wake only when out_mu is released later. // waiting threads will wake only when out_mu is released later.
@ -607,7 +607,7 @@ void CollectiveParamResolverLocal::FindInstanceRec(
void CollectiveParamResolverLocal::CallInitInstanceSharedParams( void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir, const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
const InstanceRecCallback& done) NO_THREAD_SAFETY_ANALYSIS { const InstanceRecCallback& done) TF_NO_THREAD_SAFETY_ANALYSIS {
// This function serves merely to make a function call that should // This function serves merely to make a function call that should
// be thread/mutex safe but violates the simple model applied by // be thread/mutex safe but violates the simple model applied by
// static analysis, so we turn off analysis only within this // static analysis, so we turn off analysis only within this
@ -630,7 +630,7 @@ void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
ir->known.resize(cp->group.group_size, false); ir->known.resize(cp->group.group_size, false);
InitInstanceSharedParams( InitInstanceSharedParams(
gr, cp, ir, gr, cp, ir,
[this, ir, done](const Status& s) UNLOCK_FUNCTION(ir->out_mu) { [this, ir, done](const Status& s) TF_UNLOCK_FUNCTION(ir->out_mu) {
DCHECK(ir->out_mu_available); DCHECK(ir->out_mu_available);
ir->status.Update(s); ir->status.Update(s);
ir->out_mu.unlock(); ir->out_mu.unlock();
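
The callbacks above use TF_EXCLUSIVE_LOCK_FUNCTION and TF_UNLOCK_FUNCTION to describe a lock whose ownership is handed across an asynchronous boundary: one callback returns with ir->out_mu still held, and a later one is entered with it held and releases it. Reduced to a single synchronous class (Pipeline and its members are invented; the resolver's real flow also needs TF_NO_THREAD_SAFETY_ANALYSIS where the callback chain defeats the analysis):

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class Pipeline {
 public:
  // Acquires mu_ and hands it to the caller still held; the annotation tells
  // the analysis that mu_ is locked when this function returns.
  void BeginUpdate() TF_EXCLUSIVE_LOCK_FUNCTION(mu_) {
    mu_.lock();
    ++pending_;  // Safe: mu_ is held from here on.
  }

  // Must be called with mu_ held (e.g. after BeginUpdate); releases it.
  void EndUpdate() TF_UNLOCK_FUNCTION(mu_) {
    --pending_;
    mu_.unlock();
  }

 private:
  tensorflow::mutex mu_;
  int pending_ TF_GUARDED_BY(mu_) = 0;
};

// Usage: the lock stays held between the two calls, which is the shape of the
// InitInstanceSharedParams callback chain above (taken in one function,
// released later in a callback).
void Touch(Pipeline& p) {
  p.BeginUpdate();
  p.EndUpdate();
}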


@ -66,12 +66,12 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
struct GroupRec { struct GroupRec {
CollGroupParams group; CollGroupParams group;
mutable mutex mu; mutable mutex mu;
Status status GUARDED_BY(mu); Status status TF_GUARDED_BY(mu);
std::set<string> device_set GUARDED_BY(mu); std::set<string> device_set TF_GUARDED_BY(mu);
std::vector<string> device_list GUARDED_BY(mu); std::vector<string> device_list TF_GUARDED_BY(mu);
std::set<string> task_set GUARDED_BY(mu); std::set<string> task_set TF_GUARDED_BY(mu);
std::vector<string> task_list GUARDED_BY(mu); std::vector<string> task_list TF_GUARDED_BY(mu);
std::vector<StatusCallback> waiting GUARDED_BY(mu); std::vector<StatusCallback> waiting TF_GUARDED_BY(mu);
}; };
// Finds the GroupRec that corresponds to cp->group_key. // Finds the GroupRec that corresponds to cp->group_key.
@ -84,7 +84,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
GroupRecCallback; GroupRecCallback;
void CompleteGroupLocal(const string& device, CollectiveParams* cp, void CompleteGroupLocal(const string& device, CollectiveParams* cp,
const GroupRecCallback& done) const GroupRecCallback& done)
LOCKS_EXCLUDED(group_mu_); TF_LOCKS_EXCLUDED(group_mu_);
// Used to complete/verify CollInstance. // Used to complete/verify CollInstance.
struct InstanceRec; struct InstanceRec;
@ -116,29 +116,29 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
// drop that lock, then take a lock on out_mu before // drop that lock, then take a lock on out_mu before
// reading/modifying its values. // reading/modifying its values.
mutex in_mu; mutex in_mu;
bool is_init GUARDED_BY(in_mu); bool is_init TF_GUARDED_BY(in_mu);
std::vector<IRConsumer> init_waiters GUARDED_BY(in_mu); std::vector<IRConsumer> init_waiters TF_GUARDED_BY(in_mu);
// A thread that wishes to acquire out_mu must ensure that it is available // A thread that wishes to acquire out_mu must ensure that it is available
// by invoking WaitForOutMu(). // by invoking WaitForOutMu().
mutex out_mu; mutex out_mu;
condition_variable out_cv; condition_variable out_cv;
bool out_mu_available GUARDED_BY(out_mu); bool out_mu_available TF_GUARDED_BY(out_mu);
// Values to be shared by all instances, constant after initialization. // Values to be shared by all instances, constant after initialization.
CollectiveParams shared GUARDED_BY(out_mu); CollectiveParams shared TF_GUARDED_BY(out_mu);
// If an error occurs during initialization this structure stays in // If an error occurs during initialization this structure stays in
// the table with a non-OK status. Purging the table and restarting // the table with a non-OK status. Purging the table and restarting
// needs to be done at a higher level. // needs to be done at a higher level.
Status status GUARDED_BY(out_mu); Status status TF_GUARDED_BY(out_mu);
// These fields are used to count the instances that have called // These fields are used to count the instances that have called
// in and become known while resolving broadcast source identity and // in and become known while resolving broadcast source identity and
// communicator key. // communicator key.
int source_rank GUARDED_BY(out_mu); int source_rank TF_GUARDED_BY(out_mu);
string communicator_key GUARDED_BY(out_mu); string communicator_key TF_GUARDED_BY(out_mu);
int known_count GUARDED_BY(out_mu); int known_count TF_GUARDED_BY(out_mu);
std::vector<bool> known GUARDED_BY(out_mu); std::vector<bool> known TF_GUARDED_BY(out_mu);
std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu); std::vector<IRConsumer> known_waiters TF_GUARDED_BY(out_mu);
InstanceRec() InstanceRec()
: is_init(false), : is_init(false),
@ -148,7 +148,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
// If out_mu is unavailable during distributed device locality // If out_mu is unavailable during distributed device locality
// initialization, wait on out_cv until it is available again. // initialization, wait on out_cv until it is available again.
void WaitForOutMu(mutex_lock& lock) EXCLUSIVE_LOCKS_REQUIRED(out_mu); void WaitForOutMu(mutex_lock& lock) TF_EXCLUSIVE_LOCKS_REQUIRED(out_mu);
}; };
// Find the InstanceRec with the same instance_key as cp. If it doesn't // Find the InstanceRec with the same instance_key as cp. If it doesn't
@ -162,7 +162,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
InstanceRecCallback; InstanceRecCallback;
void FindInstanceRec(const GroupRec* gr, CollectiveParams* cp, void FindInstanceRec(const GroupRec* gr, CollectiveParams* cp,
const InstanceRecCallback& done) const InstanceRecCallback& done)
LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_); TF_LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
// Populate *ir with device membership from gr, then initialize to be specific // Populate *ir with device membership from gr, then initialize to be specific
// to cp->instance_key, i.e. order the devices and tasks. // to cp->instance_key, i.e. order the devices and tasks.
@ -171,26 +171,26 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
// cp is populated with all DeviceLocalities // cp is populated with all DeviceLocalities
void InitInstanceSharedParams(const GroupRec* gr, const CollectiveParams* cp, void InitInstanceSharedParams(const GroupRec* gr, const CollectiveParams* cp,
InstanceRec* ir, const StatusCallback& done) InstanceRec* ir, const StatusCallback& done)
UNLOCK_FUNCTION(ir->out_mu) LOCKS_EXCLUDED(gr->mu); TF_UNLOCK_FUNCTION(ir->out_mu) TF_LOCKS_EXCLUDED(gr->mu);
void CallInitInstanceSharedParams(const GroupRec* gr, void CallInitInstanceSharedParams(const GroupRec* gr,
const CollectiveParams* cp, InstanceRec* ir, const CollectiveParams* cp, InstanceRec* ir,
const InstanceRecCallback& done) const InstanceRecCallback& done)
LOCKS_EXCLUDED(ir->out_mu, gr->mu); TF_LOCKS_EXCLUDED(ir->out_mu, gr->mu);
// Establishes the final order of ir->shared.instance.device_names and // Establishes the final order of ir->shared.instance.device_names and
// ir->shared.instance.task_names by considering localities of all devices. // ir->shared.instance.task_names by considering localities of all devices.
void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp, void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp,
InstanceRec* ir, InstanceRec* ir,
const std::vector<DeviceAttributes>& attributes) const std::vector<DeviceAttributes>& attributes)
EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu); TF_EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
// Finish populating *cp. // Finish populating *cp.
// Precondition: *gr has been fully populated by CompleteGroupLocal. // Precondition: *gr has been fully populated by CompleteGroupLocal.
void CompleteInstanceLocal(const string& device, const GroupRec* gr, void CompleteInstanceLocal(const string& device, const GroupRec* gr,
CollectiveParams* cp, bool is_source, CollectiveParams* cp, bool is_source,
const StatusCallback& done) const StatusCallback& done)
LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_); TF_LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
// Finish populating *cp from fully initialized *ir. // Finish populating *cp from fully initialized *ir.
// Precondition: *gr and *ir are fully populated. // Precondition: *gr and *ir are fully populated.
@ -199,12 +199,12 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
CollectiveParams* cp, CollectiveParams* cp,
InstanceRec* ir, bool is_source, InstanceRec* ir, bool is_source,
const StatusCallback& done) const StatusCallback& done)
LOCKS_EXCLUDED(ir->out_mu); TF_LOCKS_EXCLUDED(ir->out_mu);
// Complete instance params after waiting for group. // Complete instance params after waiting for group.
// Precondition: *cp has complete group data and default_rank. // Precondition: *cp has complete group data and default_rank.
void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, bool is_source, void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, bool is_source,
const IRConsumer& f) LOCKS_EXCLUDED(ir->out_mu); const IRConsumer& f) TF_LOCKS_EXCLUDED(ir->out_mu);
// If cp.device_names contains only devices local to this process // If cp.device_names contains only devices local to this process
// populates *localities, else returns an error. // populates *localities, else returns an error.
@ -225,7 +225,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
// Helper to grab status under lock, invoke callback out of lock. // Helper to grab status under lock, invoke callback out of lock.
void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec) void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
LOCKS_EXCLUDED(irec->out_mu); TF_LOCKS_EXCLUDED(irec->out_mu);
const bool nccl_; const bool nccl_;
const DeviceMgr* dev_mgr_; const DeviceMgr* dev_mgr_;
@ -233,10 +233,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
string task_name_; string task_name_;
mutex group_mu_; mutex group_mu_;
gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_ gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_
GUARDED_BY(group_mu_); TF_GUARDED_BY(group_mu_);
mutex instance_mu_; mutex instance_mu_;
gtl::FlatMap<int32, std::unique_ptr<InstanceRec>> instance_table_ gtl::FlatMap<int32, std::unique_ptr<InstanceRec>> instance_table_
GUARDED_BY(instance_mu_); TF_GUARDED_BY(instance_mu_);
}; };
} // namespace tensorflow } // namespace tensorflow


@ -47,7 +47,7 @@ class CostModelManager {
private: private:
mutex mu_; mutex mu_;
CostModelMap cost_models_ GUARDED_BY(mu_); CostModelMap cost_models_ TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow


@ -138,13 +138,14 @@ class DynamicDeviceMgr : public DeviceMgr {
mutable mutex devices_mu_; mutable mutex devices_mu_;
std::unordered_map<Device*, std::unique_ptr<Device>> dynamic_devices_ std::unordered_map<Device*, std::unique_ptr<Device>> dynamic_devices_
GUARDED_BY(devices_mu_); TF_GUARDED_BY(devices_mu_);
std::unordered_map<string, Device*> device_map_ GUARDED_BY(devices_mu_); std::unordered_map<string, Device*> device_map_ TF_GUARDED_BY(devices_mu_);
std::unordered_map<string, int> device_type_counts_ GUARDED_BY(devices_mu_); std::unordered_map<string, int> device_type_counts_
TF_GUARDED_BY(devices_mu_);
mutable Device* cpu_device_ GUARDED_BY(devices_mu_); mutable Device* cpu_device_ TF_GUARDED_BY(devices_mu_);
TF_DISALLOW_COPY_AND_ASSIGN(DynamicDeviceMgr); TF_DISALLOW_COPY_AND_ASSIGN(DynamicDeviceMgr);
}; };


@ -38,7 +38,7 @@ class DeviceSet {
~DeviceSet(); ~DeviceSet();
// Does not take ownership of 'device'. // Does not take ownership of 'device'.
void AddDevice(Device* device) LOCKS_EXCLUDED(devices_mu_); void AddDevice(Device* device) TF_LOCKS_EXCLUDED(devices_mu_);
// Set the device designated as the "client". This device // Set the device designated as the "client". This device
// must also be registered via AddDevice(). // must also be registered via AddDevice().
@ -70,7 +70,7 @@ class DeviceSet {
// Return the prioritized list of devices in this set. // Return the prioritized list of devices in this set.
// Devices are prioritized first by `DeviceTypeOrder`, then by name. // Devices are prioritized first by `DeviceTypeOrder`, then by name.
const PrioritizedDeviceVector& prioritized_devices() const const PrioritizedDeviceVector& prioritized_devices() const
LOCKS_EXCLUDED(devices_mu_); TF_LOCKS_EXCLUDED(devices_mu_);
// Return the prioritized list of unique device types in this set. // Return the prioritized list of unique device types in this set.
// //
@ -78,7 +78,7 @@ class DeviceSet {
// element in the list's `std::pair<DeviceType, int32>`) will be initialized // element in the list's `std::pair<DeviceType, int32>`) will be initialized
// to the value of `DeviceTypeOrder` for the device types. // to the value of `DeviceTypeOrder` for the device types.
const PrioritizedDeviceTypeVector& prioritized_device_types() const const PrioritizedDeviceTypeVector& prioritized_device_types() const
LOCKS_EXCLUDED(devices_mu_); TF_LOCKS_EXCLUDED(devices_mu_);
// An order to sort by device types according to system-determined // An order to sort by device types according to system-determined
// priority. // priority.
@ -112,12 +112,13 @@ class DeviceSet {
// Cached prioritized vector, created on-the-fly when // Cached prioritized vector, created on-the-fly when
// prioritized_devices() is called. // prioritized_devices() is called.
mutable PrioritizedDeviceVector prioritized_devices_ GUARDED_BY(devices_mu_); mutable PrioritizedDeviceVector prioritized_devices_
TF_GUARDED_BY(devices_mu_);
// Cached prioritized vector, created on-the-fly when // Cached prioritized vector, created on-the-fly when
// prioritized_device_types() is called. // prioritized_device_types() is called.
mutable PrioritizedDeviceTypeVector prioritized_device_types_ mutable PrioritizedDeviceTypeVector prioritized_device_types_
GUARDED_BY(devices_mu_); TF_GUARDED_BY(devices_mu_);
// Fullname -> device* for device in devices_. // Fullname -> device* for device in devices_.
std::unordered_map<string, Device*> device_by_name_; std::unordered_map<string, Device*> device_by_name_;
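
prioritized_devices() above is the classic shape of a lazily built cache behind a const accessor: a mutable mutex, cached vectors guarded by it, and TF_LOCKS_EXCLUDED on the accessors to state that callers must not already hold the lock. A compact sketch of that shape with invented names (SortedView is not part of DeviceSet):

#include <algorithm>
#include <vector>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class SortedView {
 public:
  void Add(int v) TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    values_.push_back(v);
    sorted_.clear();  // Invalidate the cache; it is rebuilt lazily.
  }

  // Const accessor that may fill the cache, hence the mutable mutex and the
  // TF_LOCKS_EXCLUDED contract (callers must not already hold mu_).
  const std::vector<int>& sorted() const TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock l(mu_);
    if (sorted_.empty() && !values_.empty()) {
      sorted_ = values_;
      std::sort(sorted_.begin(), sorted_.end());
    }
    return sorted_;
  }

 private:
  mutable tensorflow::mutex mu_;
  std::vector<int> values_ TF_GUARDED_BY(mu_);
  mutable std::vector<int> sorted_ TF_GUARDED_BY(mu_);
};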


@ -236,8 +236,9 @@ class DirectSessionFactory : public SessionFactory {
} }
mutex sessions_lock_; mutex sessions_lock_;
std::vector<DirectSession*> sessions_ GUARDED_BY(sessions_lock_); std::vector<DirectSession*> sessions_ TF_GUARDED_BY(sessions_lock_);
absl::flat_hash_set<string> session_metadata_keys_ GUARDED_BY(sessions_lock_); absl::flat_hash_set<string> session_metadata_keys_
TF_GUARDED_BY(sessions_lock_);
}; };
class DirectSessionRegistrar { class DirectSessionRegistrar {


@ -198,7 +198,7 @@ class DirectSession : public Session {
// 'status' is the current status of the execution. // 'status' is the current status of the execution.
struct RunState { struct RunState {
mutex mu; mutex mu;
Status status GUARDED_BY(mu); Status status TF_GUARDED_BY(mu);
std::unique_ptr<CollectiveExecutor::Handle> collective_executor; std::unique_ptr<CollectiveExecutor::Handle> collective_executor;
std::unique_ptr<StepStatsCollector> collector; std::unique_ptr<StepStatsCollector> collector;
TensorStore tensor_store; TensorStore tensor_store;
@ -275,7 +275,7 @@ class DirectSession : public Session {
bool ShouldUseRunHandlerPool(const RunOptions& run_options) const; bool ShouldUseRunHandlerPool(const RunOptions& run_options) const;
::tensorflow::Status ExtendLocked(GraphDef graph) ::tensorflow::Status ExtendLocked(GraphDef graph)
EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_); TF_EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
::tensorflow::Status ResourceHandleToInputTensor( ::tensorflow::Status ResourceHandleToInputTensor(
const Tensor& resource_tensor, Tensor* retrieved_tensor); const Tensor& resource_tensor, Tensor* retrieved_tensor);
@ -343,8 +343,8 @@ class DirectSession : public Session {
// Unique session identifier. // Unique session identifier.
string session_handle_; string session_handle_;
mutex graph_state_lock_; mutex graph_state_lock_;
bool graph_created_ GUARDED_BY(graph_state_lock_) = false; bool graph_created_ TF_GUARDED_BY(graph_state_lock_) = false;
bool finalized_ GUARDED_BY(graph_state_lock_) = false; bool finalized_ TF_GUARDED_BY(graph_state_lock_) = false;
// The thread-pools to use for running ops, with a bool indicating if the pool // The thread-pools to use for running ops, with a bool indicating if the pool
// is owned. // is owned.
@ -356,7 +356,7 @@ class DirectSession : public Session {
bool sync_on_finish_ = true; bool sync_on_finish_ = true;
std::vector<std::unique_ptr<FunctionInfo>> functions_ std::vector<std::unique_ptr<FunctionInfo>> functions_
GUARDED_BY(executor_lock_); TF_GUARDED_BY(executor_lock_);
mutex executor_lock_; // protects executors_ mutex executor_lock_; // protects executors_
// Holds mappings from signature to the executors that process // Holds mappings from signature to the executors that process
@ -365,7 +365,7 @@ class DirectSession : public Session {
// The map value is a shared_ptr since multiple map keys can point to the // The map value is a shared_ptr since multiple map keys can point to the
// same ExecutorsAndKey object. // same ExecutorsAndKey object.
std::unordered_map<string, std::shared_ptr<ExecutorsAndKeys>> executors_ std::unordered_map<string, std::shared_ptr<ExecutorsAndKeys>> executors_
GUARDED_BY(executor_lock_); TF_GUARDED_BY(executor_lock_);
class RunCallableCallFrame; class RunCallableCallFrame;
struct Callable { struct Callable {
@ -374,12 +374,12 @@ class DirectSession : public Session {
~Callable(); ~Callable();
}; };
mutex callables_lock_; mutex callables_lock_;
int64 next_callable_handle_ GUARDED_BY(callables_lock_) = 0; int64 next_callable_handle_ TF_GUARDED_BY(callables_lock_) = 0;
std::unordered_map<int64, Callable> callables_ GUARDED_BY(callables_lock_); std::unordered_map<int64, Callable> callables_ TF_GUARDED_BY(callables_lock_);
// Holds mappings from handle to partial run state. // Holds mappings from handle to partial run state.
std::unordered_map<string, std::unique_ptr<PartialRunState>> partial_runs_ std::unordered_map<string, std::unique_ptr<PartialRunState>> partial_runs_
GUARDED_BY(executor_lock_); TF_GUARDED_BY(executor_lock_);
// This holds all the tensors that are currently alive in the session. // This holds all the tensors that are currently alive in the session.
SessionState session_state_; SessionState session_state_;
@ -393,11 +393,11 @@ class DirectSession : public Session {
// nodes can not be moved to a different device. Maps node names to // nodes can not be moved to a different device. Maps node names to
// device names. // device names.
std::unordered_map<string, string> stateful_placements_ std::unordered_map<string, string> stateful_placements_
GUARDED_BY(graph_state_lock_); TF_GUARDED_BY(graph_state_lock_);
// Execution_state; used when placing the entire graph. // Execution_state; used when placing the entire graph.
std::unique_ptr<GraphExecutionState> execution_state_ std::unique_ptr<GraphExecutionState> execution_state_
GUARDED_BY(graph_state_lock_); TF_GUARDED_BY(graph_state_lock_);
// The function library, before any rewrites or optimizations have been // The function library, before any rewrites or optimizations have been
// performed. In particular, CreateGraphs() may need to modify the function // performed. In particular, CreateGraphs() may need to modify the function
@ -406,7 +406,7 @@ class DirectSession : public Session {
// true if the Session has been Closed. // true if the Session has been Closed.
mutex closed_lock_; mutex closed_lock_;
bool closed_ GUARDED_BY(closed_lock_) = false; bool closed_ TF_GUARDED_BY(closed_lock_) = false;
// For generating unique names for this session instance. // For generating unique names for this session instance.
std::atomic<int64> edge_name_counter_ = {0}; std::atomic<int64> edge_name_counter_ = {0};
@ -423,7 +423,7 @@ class DirectSession : public Session {
// For testing collective graph key generation. // For testing collective graph key generation.
mutex collective_graph_key_lock_; mutex collective_graph_key_lock_;
int64 collective_graph_key_ GUARDED_BY(collective_graph_key_lock_) = -1; int64 collective_graph_key_ TF_GUARDED_BY(collective_graph_key_lock_) = -1;
// Run in caller's thread if RunOptions.inter_op_thread_pool is negative or // Run in caller's thread if RunOptions.inter_op_thread_pool is negative or
// all of following conditions are met: // all of following conditions are met:


@ -293,11 +293,11 @@ class EagerContext : public core::RefCounted {
} }
// TODO(apassos) clean up RunMetadata storage. // TODO(apassos) clean up RunMetadata storage.
mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; } mutex* MetadataMu() TF_LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
bool ShouldStoreGraphs() LOCKS_EXCLUDED(metadata_mu_); bool ShouldStoreGraphs() TF_LOCKS_EXCLUDED(metadata_mu_);
void SetShouldStoreGraphs(bool value); void SetShouldStoreGraphs(bool value);
RunMetadata* RunMetadataProto() { return &run_metadata_; } RunMetadata* RunMetadataProto() { return &run_metadata_; }
void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_); void ClearRunMetadata() TF_EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
void ListDevices(std::vector<tensorflow::DeviceAttributes>* devices); void ListDevices(std::vector<tensorflow::DeviceAttributes>* devices);
@ -511,9 +511,9 @@ class EagerContext : public core::RefCounted {
// thread-local-object-local variable in C++11. // thread-local-object-local variable in C++11.
mutable mutex policy_map_mu_; mutable mutex policy_map_mu_;
std::unordered_map<std::thread::id, ContextDevicePlacementPolicy> std::unordered_map<std::thread::id, ContextDevicePlacementPolicy>
device_placement_policy_ GUARDED_BY(policy_map_mu_); device_placement_policy_ TF_GUARDED_BY(policy_map_mu_);
std::unordered_map<std::thread::id, ContextMirroringPolicy> mirroring_policy_ std::unordered_map<std::thread::id, ContextMirroringPolicy> mirroring_policy_
GUARDED_BY(policy_map_mu_); TF_GUARDED_BY(policy_map_mu_);
OwnedOrUnownedHelper<const DeviceMgr> local_device_manager_; OwnedOrUnownedHelper<const DeviceMgr> local_device_manager_;
@ -552,14 +552,14 @@ class EagerContext : public core::RefCounted {
}; };
std::unordered_map<Fprint128, core::RefCountPtr<KernelAndDevice>, std::unordered_map<Fprint128, core::RefCountPtr<KernelAndDevice>,
Fprint128Hasher> Fprint128Hasher>
kernel_cache_ GUARDED_BY(cache_mu_); kernel_cache_ TF_GUARDED_BY(cache_mu_);
std::unordered_map<string, RegisteredFunction*> registered_functions_ std::unordered_map<string, RegisteredFunction*> registered_functions_
GUARDED_BY(cache_mu_); TF_GUARDED_BY(cache_mu_);
// Whether we should compute RunMetadata. // Whether we should compute RunMetadata.
std::atomic<bool> should_store_graphs_{false}; std::atomic<bool> should_store_graphs_{false};
mutex metadata_mu_; mutex metadata_mu_;
RunMetadata run_metadata_ GUARDED_BY(metadata_mu_); RunMetadata run_metadata_ TF_GUARDED_BY(metadata_mu_);
GraphCollector graph_collector_; GraphCollector graph_collector_;
// TODO(fishx): Allow update following two bool after context creation. // TODO(fishx): Allow update following two bool after context creation.
const bool log_device_placement_; const bool log_device_placement_;
@ -567,13 +567,14 @@ class EagerContext : public core::RefCounted {
// Information related to step containers. // Information related to step containers.
std::atomic<int> num_active_steps_; std::atomic<int> num_active_steps_;
std::unique_ptr<ScopedStepContainer> step_container_ GUARDED_BY(metadata_mu_); std::unique_ptr<ScopedStepContainer> step_container_
TF_GUARDED_BY(metadata_mu_);
EagerExecutor default_executor_; EagerExecutor default_executor_;
mutable mutex executor_map_mu_; mutable mutex executor_map_mu_;
// Not owned. // Not owned.
std::unordered_map<std::thread::id, EagerExecutor*> thread_local_executor_ std::unordered_map<std::thread::id, EagerExecutor*> thread_local_executor_
GUARDED_BY(executor_map_mu_); TF_GUARDED_BY(executor_map_mu_);
const bool log_memory_; const bool log_memory_;
@ -607,28 +608,28 @@ class EagerContext : public core::RefCounted {
mutable mutex remote_state_mu_; mutable mutex remote_state_mu_;
uint64 context_id_ GUARDED_BY(remote_state_mu_); uint64 context_id_ TF_GUARDED_BY(remote_state_mu_);
// The view id of an eager context should be set to 0 when context is created, // The view id of an eager context should be set to 0 when context is created,
// and continuously incremented when context with the same context_id gets // and continuously incremented when context with the same context_id gets
// updated. The view id should be consistent between master and workers. // updated. The view id should be consistent between master and workers.
uint64 context_view_id_ GUARDED_BY(remote_state_mu_); uint64 context_view_id_ TF_GUARDED_BY(remote_state_mu_);
std::vector<string> remote_contexts_; std::vector<string> remote_contexts_;
int keep_alive_secs_ GUARDED_BY(remote_state_mu_); int keep_alive_secs_ TF_GUARDED_BY(remote_state_mu_);
std::atomic<int> sleep_for_secs_; std::atomic<int> sleep_for_secs_;
std::unique_ptr<Thread> keep_alive_thread_; std::unique_ptr<Thread> keep_alive_thread_;
mutex keep_alive_thread_shutdown_mu_; mutex keep_alive_thread_shutdown_mu_;
condition_variable keep_alive_thread_cv_; condition_variable keep_alive_thread_cv_;
bool shutting_down_ GUARDED_BY(keep_alive_thread_shutdown_mu_) = false; bool shutting_down_ TF_GUARDED_BY(keep_alive_thread_shutdown_mu_) = false;
std::unique_ptr<eager::RemoteMgr, std::function<void(eager::RemoteMgr*)>> std::unique_ptr<eager::RemoteMgr, std::function<void(eager::RemoteMgr*)>>
remote_mgr_; remote_mgr_;
bool is_master_ GUARDED_BY(remote_state_mu_); bool is_master_ TF_GUARDED_BY(remote_state_mu_);
// Maps from a remote worker to a list of parsed device filters. // Maps from a remote worker to a list of parsed device filters.
std::unordered_map<string, std::vector<DeviceNameUtils::ParsedName>> std::unordered_map<string, std::vector<DeviceNameUtils::ParsedName>>
cluster_device_filters_ GUARDED_BY(remote_state_mu_); cluster_device_filters_ TF_GUARDED_BY(remote_state_mu_);
#endif // IS_MOBILE_PLATFORM #endif // IS_MOBILE_PLATFORM
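
MetadataMu above demonstrates one more annotation touched by this change, TF_LOCK_RETURNED: it declares that a getter returns a particular mutex, so the analysis treats locking the returned pointer as acquiring that member. A compact sketch with invented names (Widget, mu, value are not part of EagerContext):

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class Widget {
 public:
  // Tells the analysis that the returned pointer is exactly &mu_, so callers
  // who lock it are credited with holding mu_.
  tensorflow::mutex* mu() TF_LOCK_RETURNED(mu_) { return &mu_; }

  int value() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { return value_; }

 private:
  tensorflow::mutex mu_;
  int value_ TF_GUARDED_BY(mu_) = 0;
};

int ReadValue(Widget& w) {
  tensorflow::mutex_lock l(*w.mu());  // Counts as acquiring w.mu_.
  return w.value();                   // Satisfies the REQUIRED annotation.
}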


@ -151,7 +151,7 @@ class EagerExecutor {
return status_; return status_;
} }
bool ok() const NO_THREAD_SAFETY_ANALYSIS { return ok_; } bool ok() const TF_NO_THREAD_SAFETY_ANALYSIS { return ok_; }
private: private:
// Possible states for this executor. // Possible states for this executor.
@ -183,11 +183,12 @@ class EagerExecutor {
NodeState state; NodeState state;
}; };
const char* StateStringLocked() EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); const char* StateStringLocked()
TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
void NodeDone(const core::RefCountPtr<NodeItem>& item, const Status& status, void NodeDone(const core::RefCountPtr<NodeItem>& item, const Status& status,
bool from_queue); bool from_queue);
void NotifyWaiters(uint64 id) EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); void NotifyWaiters(uint64 id) TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
// Starts execution of pending EagerNodes. This function loops till executor // Starts execution of pending EagerNodes. This function loops till executor
// state_ is set to kShutDown. If any errors are encountered, these are set // state_ is set to kShutDown. If any errors are encountered, these are set
@ -201,7 +202,7 @@ class EagerExecutor {
// The impl of WaitForAllPendingNodes // The impl of WaitForAllPendingNodes
// `lock` is the lock that holds node_queue_mutex_. // `lock` is the lock that holds node_queue_mutex_.
Status WaitForAllPendingNodesLocked(mutex_lock* lock) Status WaitForAllPendingNodesLocked(mutex_lock* lock)
EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
Status WaitImpl(bool wait_all, uint64 node_id); Status WaitImpl(bool wait_all, uint64 node_id);
@ -210,27 +211,27 @@ class EagerExecutor {
mutable mutex node_queue_mutex_; mutable mutex node_queue_mutex_;
// Used to signal that some EagerNodes are pending execution. // Used to signal that some EagerNodes are pending execution.
condition_variable nodes_pending_ GUARDED_BY(node_queue_mutex_); condition_variable nodes_pending_ TF_GUARDED_BY(node_queue_mutex_);
// Queue of pending NodeItems. Ordered by NodeItem::id. // Queue of pending NodeItems. Ordered by NodeItem::id.
std::queue<core::RefCountPtr<NodeItem>> node_queue_ std::queue<core::RefCountPtr<NodeItem>> node_queue_
GUARDED_BY(node_queue_mutex_); TF_GUARDED_BY(node_queue_mutex_);
// Ordered by NodeItem::id. // Ordered by NodeItem::id.
std::map<uint64, core::RefCountPtr<NodeItem>, std::less<uint64>> std::map<uint64, core::RefCountPtr<NodeItem>, std::less<uint64>>
unfinished_nodes_ GUARDED_BY(node_queue_mutex_); unfinished_nodes_ TF_GUARDED_BY(node_queue_mutex_);
// `status_` is set based on any errors raised during execution of a // `status_` is set based on any errors raised during execution of a
// EagerNode. It remains set until ClearError is called. // EagerNode. It remains set until ClearError is called.
Status status_ GUARDED_BY(node_queue_mutex_); Status status_ TF_GUARDED_BY(node_queue_mutex_);
std::atomic<bool> ok_ GUARDED_BY(node_queue_mutex_); std::atomic<bool> ok_ TF_GUARDED_BY(node_queue_mutex_);
// Map from id of a EagerNode to condition_variables (not owned by the map). // Map from id of a EagerNode to condition_variables (not owned by the map).
// These condition_variables are notified and removed when that EagerNode is // These condition_variables are notified and removed when that EagerNode is
// done executing, or if an error is found in execution of any EagerNode. // done executing, or if an error is found in execution of any EagerNode.
// The map is ordered by id. // The map is ordered by id.
std::multimap<uint64, condition_variable*, std::less<uint64>> std::multimap<uint64, condition_variable*, std::less<uint64>>
node_done_notifications_ GUARDED_BY(node_queue_mutex_); node_done_notifications_ TF_GUARDED_BY(node_queue_mutex_);
// thread_exited_notification_ is notified by the `thread_` right before it // thread_exited_notification_ is notified by the `thread_` right before it
// exits. // exits.
@ -238,7 +239,8 @@ class EagerExecutor {
// When state_ is set to kShutDown, it indicates that `thread_` should stop as // When state_ is set to kShutDown, it indicates that `thread_` should stop as
// soon as it is done executing the current EagerNode. // soon as it is done executing the current EagerNode.
ExecutorState state_ GUARDED_BY(node_queue_mutex_) = ExecutorState::kActive; ExecutorState state_ TF_GUARDED_BY(node_queue_mutex_) =
ExecutorState::kActive;
// Thread object that calls the `Run` method in async mode.This thread runs // Thread object that calls the `Run` method in async mode.This thread runs
// until state_ is set to kShuttingDown. It is `nullptr` in sync mode. // until state_ is set to kShuttingDown. It is `nullptr` in sync mode.


@ -241,17 +241,17 @@ class TensorHandle : public core::RefCounted {
// Map of local mirrors. This can include both ready and non-ready mirrors. // Map of local mirrors. This can include both ready and non-ready mirrors.
std::unordered_map<const tensorflow::Device*, LocalTensorHandleData> std::unordered_map<const tensorflow::Device*, LocalTensorHandleData>
local_mirrors_ GUARDED_BY(mu_); local_mirrors_ TF_GUARDED_BY(mu_);
#if !defined(IS_MOBILE_PLATFORM) #if !defined(IS_MOBILE_PLATFORM)
// TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica // TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica
// variable is ready, since we could get the shape locally without remote copy // variable is ready, since we could get the shape locally without remote copy
// then. // then.
std::unordered_map<string, RemoteTensorHandleData> resource_shape_mirrors_ std::unordered_map<string, RemoteTensorHandleData> resource_shape_mirrors_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
// TODO(gjn): Is std::map the most optimal choice here? Perhaps this should be // TODO(gjn): Is std::map the most optimal choice here? Perhaps this should be
// a fixed size map. // a fixed size map.
std::unordered_map<string, RemoteTensorHandleData> remote_mirrors_ std::unordered_map<string, RemoteTensorHandleData> remote_mirrors_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
#endif #endif
// `ctx` is only guaranteed to be set if the handle is not "ready". This is // `ctx` is only guaranteed to be set if the handle is not "ready". This is

View File

@ -98,8 +98,8 @@ class LocalTensorHandleData {
private: private:
mutable mutex mu_; mutable mutex mu_;
bool is_ready_ GUARDED_BY(mu_); bool is_ready_ TF_GUARDED_BY(mu_);
Status is_poisoned_ GUARDED_BY(mu_); Status is_poisoned_ TF_GUARDED_BY(mu_);
}; };
absl::variant<NonBlockingControl, BlockingControl> ctrl_; absl::variant<NonBlockingControl, BlockingControl> ctrl_;

View File

@ -1117,16 +1117,16 @@ class ExecutorState {
int num_pending_inputs = 0; int num_pending_inputs = 0;
// The highest iteration number we have reached so far in this frame. // The highest iteration number we have reached so far in this frame.
int64 iteration_count GUARDED_BY(mu) = 0; int64 iteration_count TF_GUARDED_BY(mu) = 0;
// The number of outstanding iterations. // The number of outstanding iterations.
int num_outstanding_iterations GUARDED_BY(mu) = 1; int num_outstanding_iterations TF_GUARDED_BY(mu) = 1;
private: private:
// The active iteration states of this frame. // The active iteration states of this frame.
gtl::InlinedVector<IterationState*, 12> iterations; gtl::InlinedVector<IterationState*, 12> iterations;
IterationState** const iterations_raw GUARDED_BY(mu); IterationState** const iterations_raw TF_GUARDED_BY(mu);
IterationState* iterations_first GUARDED_BY(mu); IterationState* iterations_first TF_GUARDED_BY(mu);
public: public:
// The NextIteration nodes to enter a new iteration. If the number of // The NextIteration nodes to enter a new iteration. If the number of
@ -1134,18 +1134,18 @@ class ExecutorState {
// the next iteration until the number of outstanding iterations falls // the next iteration until the number of outstanding iterations falls
// below the limit. // below the limit.
std::vector<std::pair<const NodeItem*, Entry>> next_iter_roots std::vector<std::pair<const NodeItem*, Entry>> next_iter_roots
GUARDED_BY(mu); TF_GUARDED_BY(mu);
// The values of the loop invariants for this loop. They are added into // The values of the loop invariants for this loop. They are added into
// this list as they "enter" the frame. When a loop invariant enters, // this list as they "enter" the frame. When a loop invariant enters,
// we make it available to all active iterations. When the frame starts // we make it available to all active iterations. When the frame starts
// a new iteration, we make all the current loop invariants available // a new iteration, we make all the current loop invariants available
// to the new iteration. // to the new iteration.
std::vector<std::pair<const NodeItem*, Entry>> inv_values GUARDED_BY(mu); std::vector<std::pair<const NodeItem*, Entry>> inv_values TF_GUARDED_BY(mu);
// The list of dead exit node items for the current highest iteration. We // The list of dead exit node items for the current highest iteration. We
// will only "execute" the dead exits of the final iteration. // will only "execute" the dead exits of the final iteration.
std::vector<const NodeItem*> dead_exits GUARDED_BY(mu); std::vector<const NodeItem*> dead_exits TF_GUARDED_BY(mu);
// Static information specific to this frame. // Static information specific to this frame.
PendingCounts* pending_counts = nullptr; PendingCounts* pending_counts = nullptr;
@ -1167,7 +1167,7 @@ class ExecutorState {
} }
inline IterationState* GetIteration(int64 iter) inline IterationState* GetIteration(int64 iter)
EXCLUSIVE_LOCKS_REQUIRED(mu) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
if (TF_PREDICT_TRUE(iter == 0)) { if (TF_PREDICT_TRUE(iter == 0)) {
return iterations_first; return iterations_first;
} else { } else {
@ -1177,7 +1177,7 @@ class ExecutorState {
} }
inline void SetIteration(int64 iter, IterationState* state) inline void SetIteration(int64 iter, IterationState* state)
EXCLUSIVE_LOCKS_REQUIRED(mu) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
size_t index = iter % (max_parallel_iterations + 1); size_t index = iter % (max_parallel_iterations + 1);
DCHECK(state == nullptr || iterations[index] == nullptr); DCHECK(state == nullptr || iterations[index] == nullptr);
iterations_raw[index] = state; iterations_raw[index] = state;
@ -1198,7 +1198,7 @@ class ExecutorState {
// frame. Return true iff the execution of the frame is done. // frame. Return true iff the execution of the frame is done.
inline bool DecrementOutstandingOpsLocked(const GraphView* gview, inline bool DecrementOutstandingOpsLocked(const GraphView* gview,
int64 iter, TaggedNodeSeq* ready) int64 iter, TaggedNodeSeq* ready)
EXCLUSIVE_LOCKS_REQUIRED(mu) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
IterationState* istate = GetIteration(iter); IterationState* istate = GetIteration(iter);
istate->outstanding_ops--; istate->outstanding_ops--;
if (istate->outstanding_ops != 0) { if (istate->outstanding_ops != 0) {
@ -1209,39 +1209,40 @@ class ExecutorState {
} }
// Returns true if the computation in the frame is completed. // Returns true if the computation in the frame is completed.
inline bool IsFrameDone() EXCLUSIVE_LOCKS_REQUIRED(mu) { inline bool IsFrameDone() TF_EXCLUSIVE_LOCKS_REQUIRED(mu) {
return (num_pending_inputs == 0 && num_outstanding_iterations == 0); return (num_pending_inputs == 0 && num_outstanding_iterations == 0);
} }
// Returns true if the iteration of the frame is completed. // Returns true if the iteration of the frame is completed.
bool IsIterationDone(int64 iter) EXCLUSIVE_LOCKS_REQUIRED(mu); bool IsIterationDone(int64 iter) TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Increments the iteration id. If this is a new iteration, initialize it. // Increments the iteration id. If this is a new iteration, initialize it.
void IncrementIteration(const GraphView* gview, TaggedNodeSeq* ready) void IncrementIteration(const GraphView* gview, TaggedNodeSeq* ready)
EXCLUSIVE_LOCKS_REQUIRED(mu); TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Activate all the deferred NextIteration nodes in a new iteration. // Activate all the deferred NextIteration nodes in a new iteration.
void ActivateNexts(const GraphView* gview, int64 iter, TaggedNodeSeq* ready) void ActivateNexts(const GraphView* gview, int64 iter, TaggedNodeSeq* ready)
EXCLUSIVE_LOCKS_REQUIRED(mu); TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Activate all the current loop invariants in a new iteration. // Activate all the current loop invariants in a new iteration.
void ActivateLoopInvs(const GraphView* gview, int64 iter, void ActivateLoopInvs(const GraphView* gview, int64 iter,
TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu); TaggedNodeSeq* ready) TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Add a new loop invariant and make it available to all active // Add a new loop invariant and make it available to all active
// iterations. // iterations.
void AddLoopInv(const NodeItem* item, const Entry& entry, void AddLoopInv(const NodeItem* item, const Entry& entry,
TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu); TaggedNodeSeq* ready) TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Activate the successors of a node. Contents of *outputs are left in an // Activate the successors of a node. Contents of *outputs are left in an
// indeterminate state after returning from this method. // indeterminate state after returning from this method.
void ActivateNodes(const NodeItem* item, const bool is_dead, int64 iter, void ActivateNodes(const NodeItem* item, const bool is_dead, int64 iter,
EntryVector* outputs, TaggedNodeSeq* ready) EntryVector* outputs, TaggedNodeSeq* ready)
EXCLUSIVE_LOCKS_REQUIRED(mu); TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
// Cleanup iterations of this frame starting from iteration iter. // Cleanup iterations of this frame starting from iteration iter.
bool CleanupIterations(const GraphView* gview, int64 iter, bool CleanupIterations(const GraphView* gview, int64 iter,
TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu); TaggedNodeSeq* ready)
TF_EXCLUSIVE_LOCKS_REQUIRED(mu);
void DumpIterationState(ExecutorState* parent) { void DumpIterationState(ExecutorState* parent) {
mutex_lock l(mu); mutex_lock l(mu);
@ -1361,18 +1362,19 @@ class ExecutorState {
// Available via OpKernelContext to every OpKernel invocation. // Available via OpKernelContext to every OpKernel invocation.
mutex num_deferred_ops_mu_; mutex num_deferred_ops_mu_;
int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0; int64 num_deferred_ops_ TF_GUARDED_BY(num_deferred_ops_mu_) = 0;
bool finish_when_deferred_ops_done_ GUARDED_BY(num_deferred_ops_mu_) = false; bool finish_when_deferred_ops_done_ TF_GUARDED_BY(num_deferred_ops_mu_) =
false;
mutex mu_; mutex mu_;
Status status_ GUARDED_BY(mu_); Status status_ TF_GUARDED_BY(mu_);
// Mapping from frame name to outstanding frames. A new frame is created // Mapping from frame name to outstanding frames. A new frame is created
// at some iteration of an active frame. So the unique key for the new // at some iteration of an active frame. So the unique key for the new
// child frame is composed of the name of the parent frame, the iteration // child frame is composed of the name of the parent frame, the iteration
// number at which the parent frame is creating the new frame, and the // number at which the parent frame is creating the new frame, and the
// name of the new frame from nodedef. // name of the new frame from nodedef.
gtl::FlatMap<string, FrameState*> outstanding_frames_ GUARDED_BY(mu_); gtl::FlatMap<string, FrameState*> outstanding_frames_ TF_GUARDED_BY(mu_);
// The unique name of a frame. // The unique name of a frame.
inline string MakeFrameName(FrameState* frame, int64 iter_id, inline string MakeFrameName(FrameState* frame, int64 iter_id,
@ -1452,7 +1454,7 @@ class ExecutorState {
// resizes and this particular iteration's array element will not // resizes and this particular iteration's array element will not
// be changed out from under us because the iteration is still alive). // be changed out from under us because the iteration is still alive).
Entry* GetInputTensors(FrameState* input_frame, Entry* GetInputTensors(FrameState* input_frame,
int64 input_iter) const NO_THREAD_SAFETY_ANALYSIS { int64 input_iter) const TF_NO_THREAD_SAFETY_ANALYSIS {
return input_frame->GetIteration(input_iter)->input_tensors; return input_frame->GetIteration(input_iter)->input_tensors;
} }
}; };
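
GetIteration and SetIteration above index into a fixed ring of iteration slots sized max_parallel_iterations + 1. A stripped-down sketch of that indexing scheme, with simplified names and assuming non-negative iteration numbers (not the actual FrameState code):

#include <cassert>
#include <cstdint>
#include <vector>

// Ring of iteration slots: at most max_parallel + 1 iterations are live at
// once, so a slot can be reused after its iteration has been cleaned up.
template <typename IterationState>
class IterationRing {
 public:
  explicit IterationRing(int max_parallel)
      : slots_(max_parallel + 1, nullptr) {}

  IterationState* Get(int64_t iter) const {
    return slots_[static_cast<size_t>(iter) % slots_.size()];
  }

  void Set(int64_t iter, IterationState* state) {
    size_t index = static_cast<size_t>(iter) % slots_.size();
    // Mirrors the DCHECK in the real code: a slot must be empty before reuse.
    assert(state == nullptr || slots_[index] == nullptr);
    slots_[index] = state;
  }

 private:
  std::vector<IterationState*> slots_;
};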

View File

@ -188,8 +188,8 @@ class ExecutorBarrier {
StatusCallback done_cb_ = nullptr; StatusCallback done_cb_ = nullptr;
mutable mutex mu_; mutable mutex mu_;
int pending_ GUARDED_BY(mu_) = 0; int pending_ TF_GUARDED_BY(mu_) = 0;
StatusGroup status_group_ GUARDED_BY(mu_); StatusGroup status_group_ TF_GUARDED_BY(mu_);
void WhenDone(const Status& s) { void WhenDone(const Status& s) {
Rendezvous* error_rendez = nullptr; Rendezvous* error_rendez = nullptr;

View File

@ -48,7 +48,7 @@ void ExecutorFactory::Register(const string& executor_type,
namespace { namespace {
const string RegisteredFactoriesErrorMessageLocked() const string RegisteredFactoriesErrorMessageLocked()
SHARED_LOCKS_REQUIRED(executor_factory_lock) { TF_SHARED_LOCKS_REQUIRED(executor_factory_lock) {
std::vector<string> factory_types; std::vector<string> factory_types;
for (const auto& executor_factory : *executor_factories()) { for (const auto& executor_factory : *executor_factories()) {
factory_types.push_back(executor_factory.first); factory_types.push_back(executor_factory.first);

View File

@ -402,7 +402,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
mutable mutex mu_; mutable mutex mu_;
int next_handle_ GUARDED_BY(mu_); int next_handle_ TF_GUARDED_BY(mu_);
// The instantiated and transformed function is encoded as a Graph // The instantiated and transformed function is encoded as a Graph
// object, and an executor is created for the graph. // object, and an executor is created for the graph.
@ -423,7 +423,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
} }
}; };
std::unique_ptr<std::unordered_map<Handle, std::unique_ptr<Item>>> items_ std::unique_ptr<std::unordered_map<Handle, std::unique_ptr<Item>>> items_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned. ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned.

View File

@ -58,7 +58,7 @@ class FakeAllocator {
AllocatorRetry retry_; AllocatorRetry retry_;
void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef); void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
mutex mu_; mutex mu_;
size_t memory_capacity_ GUARDED_BY(mu_); size_t memory_capacity_ TF_GUARDED_BY(mu_);
int millis_to_wait_; int millis_to_wait_;
}; };
@ -100,7 +100,7 @@ class AlternatingBarrier {
} }
private: private:
void IncrementTurn() EXCLUSIVE_LOCKS_REQUIRED(mu_) { void IncrementTurn() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
int skipped = 0; int skipped = 0;
while (skipped < num_users_) { while (skipped < num_users_) {
next_turn_ = (next_turn_ + 1) % num_users_; next_turn_ = (next_turn_ + 1) % num_users_;
@ -112,8 +112,8 @@ class AlternatingBarrier {
mutex mu_; mutex mu_;
condition_variable cv_; condition_variable cv_;
int num_users_; int num_users_;
int next_turn_ GUARDED_BY(mu_); int next_turn_ TF_GUARDED_BY(mu_);
std::vector<bool> done_ GUARDED_BY(mu_); std::vector<bool> done_ TF_GUARDED_BY(mu_);
}; };
class GPUAllocatorRetryTest : public ::testing::Test { class GPUAllocatorRetryTest : public ::testing::Test {
@ -174,8 +174,8 @@ class GPUAllocatorRetryTest : public ::testing::Test {
std::vector<int> consumer_count_; std::vector<int> consumer_count_;
Notification notifier_; Notification notifier_;
mutex mu_; mutex mu_;
bool has_failed_ GUARDED_BY(mu_) = false; bool has_failed_ TF_GUARDED_BY(mu_) = false;
int count_ GUARDED_BY(mu_) = 0; int count_ TF_GUARDED_BY(mu_) = 0;
}; };
// Verifies correct retrying when memory is slightly overcommitted but // Verifies correct retrying when memory is slightly overcommitted but

View File

@ -233,7 +233,7 @@ class GPUKernelTracker {
// Caller is responsible for ensuring that RecordTerminate() is eventually // Caller is responsible for ensuring that RecordTerminate() is eventually
// called with the same counter value. // called with the same counter value.
void RecordQueued(uint64 queued_count, int weight) void RecordQueued(uint64 queued_count, int weight)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// Takes a count value returned by RecordQueued and finds the corresponding // Takes a count value returned by RecordQueued and finds the corresponding
// PendingKernel record in the ring buffer. Marks the kernel as completed and // PendingKernel record in the ring buffer. Marks the kernel as completed and
@ -259,7 +259,7 @@ class GPUKernelTracker {
// Yield current thread until number of pending kernels no longer // Yield current thread until number of pending kernels no longer
// exceeds the cap. // exceeds the cap.
void PauseWhilePendingExceeds(int cap) LOCKS_EXCLUDED(mu_) { void PauseWhilePendingExceeds(int cap) TF_LOCKS_EXCLUDED(mu_) {
mutex_lock l(mu_); mutex_lock l(mu_);
while (num_pending_ > cap) { while (num_pending_ > cap) {
VLOG(1) << "num_pending_=" << num_pending_ << " cap=" << cap; VLOG(1) << "num_pending_=" << num_pending_ << " cap=" << cap;
@ -293,20 +293,20 @@ class GPUKernelTracker {
PendingKernel() : queued_count(0), weight(0), terminated(false) {} PendingKernel() : queued_count(0), weight(0), terminated(false) {}
}; };
mutex mu_; mutex mu_;
int32 mem_since_last_ GUARDED_BY(mu_); int32 mem_since_last_ TF_GUARDED_BY(mu_);
int32 ops_since_last_ GUARDED_BY(mu_); int32 ops_since_last_ TF_GUARDED_BY(mu_);
// Ring buffer of PendingKernel records. // Ring buffer of PendingKernel records.
std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_); std::vector<PendingKernel> pending_kernels_ TF_GUARDED_BY(mu_);
// Next unused slot in pending_kernels_. // Next unused slot in pending_kernels_.
int first_available_ GUARDED_BY(mu_) = 0; int first_available_ TF_GUARDED_BY(mu_) = 0;
// Last completed PendingKernel such that all prior PendingKernels are // Last completed PendingKernel such that all prior PendingKernels are
// also completed. With out-of-order completion there may be a mixture // also completed. With out-of-order completion there may be a mixture
// of completed and uncompleted entries between last_completed_ and // of completed and uncompleted entries between last_completed_ and
// first_available_. // first_available_.
int last_completed_ GUARDED_BY(mu_) = -1; int last_completed_ TF_GUARDED_BY(mu_) = -1;
// Sum of weights of the outstanding events marking tracked kernels. // Sum of weights of the outstanding events marking tracked kernels.
int num_pending_ GUARDED_BY(mu_) = 0; int num_pending_ TF_GUARDED_BY(mu_) = 0;
condition_variable pending_decreased_ GUARDED_BY(mu_); condition_variable pending_decreased_ TF_GUARDED_BY(mu_);
}; };
class BaseGPUDeviceFactory : public DeviceFactory { class BaseGPUDeviceFactory : public DeviceFactory {
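
The last_completed_ / first_available_ comments above describe a ring buffer that tolerates out-of-order kernel completion. A hedged sketch of how such a cursor might advance past contiguous terminated entries (illustrative only; the real RecordTerminate does more bookkeeping under mu_):

#include <cstdint>
#include <vector>

struct PendingKernel {
  uint64_t queued_count = 0;
  int weight = 0;
  bool terminated = false;
};

// Advances *last_completed over every contiguous terminated entry.
// Entries between the new cursor and first_available may still be a mix of
// finished and unfinished kernels. Assumes a non-empty ring.
void AdvanceLastCompleted(const std::vector<PendingKernel>& ring,
                          int first_available, int* last_completed) {
  if (ring.empty()) return;
  int next = (*last_completed + 1) % static_cast<int>(ring.size());
  while (next != first_available && ring[next].terminated) {
    *last_completed = next;
    next = (next + 1) % static_cast<int>(ring.size());
  }
}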

View File

@ -112,9 +112,9 @@ class EventMgr {
const int64 deferred_bytes_threshold_; const int64 deferred_bytes_threshold_;
const int32 polling_active_delay_usecs_; const int32 polling_active_delay_usecs_;
mutex mu_; mutex mu_;
condition_variable events_pending_ GUARDED_BY(mu_); condition_variable events_pending_ TF_GUARDED_BY(mu_);
void FlushAccumulatedTensors() EXCLUSIVE_LOCKS_REQUIRED(mu_); void FlushAccumulatedTensors() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
struct InUse { struct InUse {
se::Event* event; se::Event* event;
@ -152,20 +152,20 @@ class EventMgr {
// Tensors and/or a BufRec to be deleted only after the Event // Tensors and/or a BufRec to be deleted only after the Event
// records. // records.
void QueueInUse(se::Stream* stream, InUse in_use) void QueueInUse(se::Stream* stream, InUse in_use)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors) void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors)
EXCLUSIVE_LOCKS_REQUIRED(mu_) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}); QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
} }
void QueueBuffer(se::Stream* stream, BufRec bufrec) void QueueBuffer(se::Stream* stream, BufRec bufrec)
EXCLUSIVE_LOCKS_REQUIRED(mu_) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}); QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
} }
void QueueFunc(se::Stream* stream, std::function<void()> func) void QueueFunc(se::Stream* stream, std::function<void()> func)
EXCLUSIVE_LOCKS_REQUIRED(mu_) { TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
QueueInUse(stream, {nullptr, nullptr, BufRec(), std::move(func)}); QueueInUse(stream, {nullptr, nullptr, BufRec(), std::move(func)});
} }
@ -175,7 +175,7 @@ class EventMgr {
// to "*to_free". The caller should call FreeMemory(to_free) // to "*to_free". The caller should call FreeMemory(to_free)
// when this returns. // when this returns.
void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free) void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// An internal polling loop that runs at a low frequency to clear // An internal polling loop that runs at a low frequency to clear
// straggler Events. // straggler Events.
@ -186,18 +186,18 @@ class EventMgr {
void StopPollingLoop(); void StopPollingLoop();
// A stack of unused events // A stack of unused events
std::vector<se::Event*> free_events_ GUARDED_BY(mu_); std::vector<se::Event*> free_events_ TF_GUARDED_BY(mu_);
// Buffered list of tensors waiting to have an event queued for deletion // Buffered list of tensors waiting to have an event queued for deletion
se::Stream* accumulated_stream_ GUARDED_BY(mu_); se::Stream* accumulated_stream_ TF_GUARDED_BY(mu_);
TensorReferenceVector* accumulated_tensors_ GUARDED_BY(mu_); TensorReferenceVector* accumulated_tensors_ TF_GUARDED_BY(mu_);
// Sum of the TotalBytes() of the tensors in "accumulated_tensors_" // Sum of the TotalBytes() of the tensors in "accumulated_tensors_"
int64 accumulated_tensor_bytes_ GUARDED_BY(mu_); int64 accumulated_tensor_bytes_ TF_GUARDED_BY(mu_);
// A FIFO queue of InUse events and associated tensors. // A FIFO queue of InUse events and associated tensors.
std::deque<InUse> used_events_ GUARDED_BY(mu_); std::deque<InUse> used_events_ TF_GUARDED_BY(mu_);
bool stop_polling_ GUARDED_BY(mu_); bool stop_polling_ TF_GUARDED_BY(mu_);
std::unique_ptr<Notification> polling_stopped_; std::unique_ptr<Notification> polling_stopped_;
// The main PollLoop for the event manager runs in this threadpool. // The main PollLoop for the event manager runs in this threadpool.
@ -216,7 +216,7 @@ class EventMgrFactory {
// Maintain one EventMgr per physical device (StreamExecutor is // Maintain one EventMgr per physical device (StreamExecutor is
// per-physical-device). // per-physical-device).
std::map<se::StreamExecutor*, EventMgr*> event_mgr_map_ GUARDED_BY(mu_); std::map<se::StreamExecutor*, EventMgr*> event_mgr_map_ TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -35,7 +35,7 @@ class TfToPlatformGpuIdMap {
} }
Status Insert(TfGpuId tf_gpu_id, PlatformGpuId platform_gpu_id) Status Insert(TfGpuId tf_gpu_id, PlatformGpuId platform_gpu_id)
LOCKS_EXCLUDED(mu_) { TF_LOCKS_EXCLUDED(mu_) {
std::pair<IdMapType::iterator, bool> result; std::pair<IdMapType::iterator, bool> result;
{ {
mutex_lock lock(mu_); mutex_lock lock(mu_);
@ -58,7 +58,7 @@ class TfToPlatformGpuIdMap {
} }
bool Find(TfGpuId tf_gpu_id, PlatformGpuId* platform_gpu_id) const bool Find(TfGpuId tf_gpu_id, PlatformGpuId* platform_gpu_id) const
LOCKS_EXCLUDED(mu_) { TF_LOCKS_EXCLUDED(mu_) {
// TODO(mrry): Consider replacing this with an atomic `is_initialized` bit, // TODO(mrry): Consider replacing this with an atomic `is_initialized` bit,
// to avoid writing to a shared cache line in the tf_shared_lock. // to avoid writing to a shared cache line in the tf_shared_lock.
tf_shared_lock lock(mu_); tf_shared_lock lock(mu_);
@ -71,14 +71,14 @@ class TfToPlatformGpuIdMap {
private: private:
TfToPlatformGpuIdMap() = default; TfToPlatformGpuIdMap() = default;
void TestOnlyReset() LOCKS_EXCLUDED(mu_) { void TestOnlyReset() TF_LOCKS_EXCLUDED(mu_) {
mutex_lock lock(mu_); mutex_lock lock(mu_);
id_map_.clear(); id_map_.clear();
} }
using IdMapType = std::unordered_map<int32, int32>; using IdMapType = std::unordered_map<int32, int32>;
mutable mutex mu_; mutable mutex mu_;
IdMapType id_map_ GUARDED_BY(mu_); IdMapType id_map_ TF_GUARDED_BY(mu_);
friend class ::tensorflow::GpuIdManager; friend class ::tensorflow::GpuIdManager;
TF_DISALLOW_COPY_AND_ASSIGN(TfToPlatformGpuIdMap); TF_DISALLOW_COPY_AND_ASSIGN(TfToPlatformGpuIdMap);

View File

@ -56,13 +56,13 @@ class GPUProcessState {
// Query whether any GPU device has been created so far. // Query whether any GPU device has been created so far.
// Disable thread safety analysis since a race is benign here. // Disable thread safety analysis since a race is benign here.
bool HasGPUDevice() const NO_THREAD_SAFETY_ANALYSIS { bool HasGPUDevice() const TF_NO_THREAD_SAFETY_ANALYSIS {
return gpu_device_enabled_; return gpu_device_enabled_;
} }
// Set the flag to indicate a GPU device has been created. // Set the flag to indicate a GPU device has been created.
// Disable thread safety analysis since a race is benign here. // Disable thread safety analysis since a race is benign here.
void EnableGPUDevice() NO_THREAD_SAFETY_ANALYSIS { void EnableGPUDevice() TF_NO_THREAD_SAFETY_ANALYSIS {
gpu_device_enabled_ = true; gpu_device_enabled_ = true;
} }
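
HasGPUDevice and EnableGPUDevice opt the whole function out of the analysis because the race on the flag is considered benign. A small sketch of that pattern with the prefixed macro (hypothetical class, assuming the same headers as above):

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class FeatureFlag {  // hypothetical, for illustration only
 public:
  // Reads flag_ without mu_; the race is tolerated, so the thread-safety
  // analysis is switched off for just this function.
  bool enabled() const TF_NO_THREAD_SAFETY_ANALYSIS { return flag_; }

  void Enable() TF_NO_THREAD_SAFETY_ANALYSIS { flag_ = true; }

 private:
  mutable tensorflow::mutex mu_;
  bool flag_ TF_GUARDED_BY(mu_) = false;
};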
@ -147,14 +147,15 @@ class GPUProcessState {
SubAllocator* sub_allocator; // owned by allocator SubAllocator* sub_allocator; // owned by allocator
std::unique_ptr<Allocator> recording_allocator; std::unique_ptr<Allocator> recording_allocator;
}; };
std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_); std::vector<AllocatorParts> gpu_allocators_ TF_GUARDED_BY(mu_);
std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_); std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_
TF_GUARDED_BY(mu_);
std::vector<AllocatorParts> gpu_host_allocators_ GUARDED_BY(mu_); std::vector<AllocatorParts> gpu_host_allocators_ TF_GUARDED_BY(mu_);
std::vector<std::vector<SubAllocator::Visitor>> gpu_host_alloc_visitors_ std::vector<std::vector<SubAllocator::Visitor>> gpu_host_alloc_visitors_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
std::vector<std::vector<SubAllocator::Visitor>> gpu_host_free_visitors_ std::vector<std::vector<SubAllocator::Visitor>> gpu_host_free_visitors_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -86,7 +86,7 @@ class SimpleRendezvous : public RendezvousInterface {
typedef std::unordered_map<string, Tensor> Table; typedef std::unordered_map<string, Tensor> Table;
mutex mu_; mutex mu_;
Table table_ GUARDED_BY(mu_); Table table_ TF_GUARDED_BY(mu_);
}; };
} // namespace } // namespace

View File

@ -312,7 +312,7 @@ void HierarchicalTreeBroadcaster::RunTree() {
} }
mutex mu; // also guards status_ while callbacks are pending mutex mu; // also guards status_ while callbacks are pending
int pending_count = 0; // GUARDED_BY(mu) int pending_count = 0; // TF_GUARDED_BY(mu)
condition_variable all_done; condition_variable all_done;
if (my_rank >= 0 && my_rank != source_rank) { if (my_rank >= 0 && my_rank != source_rank) {

View File

@ -188,7 +188,7 @@ class FailTestRMA : public CollectiveRemoteAccessLocal {
} }
mutex mu_; mutex mu_;
int fail_after_ GUARDED_BY(mu_); int fail_after_ TF_GUARDED_BY(mu_);
}; };
class HierarchicalTreeBroadcasterTest : public ::testing::Test { class HierarchicalTreeBroadcasterTest : public ::testing::Test {
@ -725,9 +725,9 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_; std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
std::unique_ptr<string> gpu_ring_order_; std::unique_ptr<string> gpu_ring_order_;
mutex mu_; mutex mu_;
int bcast_recv_counter_ GUARDED_BY(mu_) = 0; int bcast_recv_counter_ TF_GUARDED_BY(mu_) = 0;
int bcast_send_counter_ GUARDED_BY(mu_) = 0; int bcast_send_counter_ TF_GUARDED_BY(mu_) = 0;
int failure_count_ GUARDED_BY(mu_) = 0; int failure_count_ TF_GUARDED_BY(mu_) = 0;
}; };
TEST_F(HierarchicalTreeBroadcasterTest, InitializeParams1Task8GPU) { TEST_F(HierarchicalTreeBroadcasterTest, InitializeParams1Task8GPU) {

View File

@ -52,7 +52,7 @@ class LocalDevice : public Device {
// computations. // computations.
static mutex global_tp_mu_; static mutex global_tp_mu_;
static gtl::InlinedVector<EigenThreadPoolInfo*, 4> global_tp_info_ static gtl::InlinedVector<EigenThreadPoolInfo*, 4> global_tp_info_
GUARDED_BY(global_tp_mu_); TF_GUARDED_BY(global_tp_mu_);
friend class test::Benchmark; friend class test::Benchmark;

View File

@ -98,7 +98,7 @@ class MklSmallSizeAllocator : public Allocator {
private: private:
// Increment statistics for the allocator handling small allocations. // Increment statistics for the allocator handling small allocations.
inline void IncrementStats(size_t alloc_size) LOCKS_EXCLUDED(mutex_) { inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) {
mutex_lock l(mutex_); mutex_lock l(mutex_);
++stats_.num_allocs; ++stats_.num_allocs;
stats_.bytes_in_use += alloc_size; stats_.bytes_in_use += alloc_size;
@ -109,7 +109,7 @@ class MklSmallSizeAllocator : public Allocator {
} }
// Decrement statistics for the allocator handling small allocations. // Decrement statistics for the allocator handling small allocations.
inline void DecrementStats(size_t dealloc_size) LOCKS_EXCLUDED(mutex_) { inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) {
mutex_lock l(mutex_); mutex_lock l(mutex_);
stats_.bytes_in_use -= dealloc_size; stats_.bytes_in_use -= dealloc_size;
} }
@ -123,7 +123,7 @@ class MklSmallSizeAllocator : public Allocator {
string name_; string name_;
// Allocator stats for small allocs // Allocator stats for small allocs
AllocatorStats stats_ GUARDED_BY(mutex_); AllocatorStats stats_ TF_GUARDED_BY(mutex_);
}; };
/// CPU allocator for MKL that wraps BFC allocator and intercepts /// CPU allocator for MKL that wraps BFC allocator and intercepts
@ -199,19 +199,20 @@ class MklCPUAllocator : public Allocator {
inline string Name() override { return kName; } inline string Name() override { return kName; }
inline bool IsSmallSizeAllocation(const void* ptr) const inline bool IsSmallSizeAllocation(const void* ptr) const
LOCKS_EXCLUDED(mutex_) { TF_LOCKS_EXCLUDED(mutex_) {
mutex_lock l(mutex_); mutex_lock l(mutex_);
return large_allocations_map_.find(ptr) == large_allocations_map_.end(); return large_allocations_map_.find(ptr) == large_allocations_map_.end();
} }
// AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
inline void AddLargeAllocMap(void* ptr, size_t num_bytes) inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(mutex_) { TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
if (ptr != nullptr) { if (ptr != nullptr) {
std::pair<void*, size_t> map_val(ptr, num_bytes); std::pair<void*, size_t> map_val(ptr, num_bytes);
large_allocations_map_.insert(map_val); large_allocations_map_.insert(map_val);
} }
} }
inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) { inline void RemoveLargeAllocMap(void* ptr)
TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
auto map_iter = large_allocations_map_.find(ptr); auto map_iter = large_allocations_map_.find(ptr);
if (map_iter != large_allocations_map_.end()) { if (map_iter != large_allocations_map_.end()) {
large_allocations_map_.erase(map_iter); large_allocations_map_.erase(map_iter);
@ -313,12 +314,12 @@ class MklCPUAllocator : public Allocator {
SubAllocator* sub_allocator_; // not owned by this class SubAllocator* sub_allocator_; // not owned by this class
mutable mutex mutex_; mutable mutex mutex_;
AllocatorStats stats_ GUARDED_BY(mutex_); AllocatorStats stats_ TF_GUARDED_BY(mutex_);
// Hash map to keep track of "BFC" allocations // Hash map to keep track of "BFC" allocations
// We do not use BFC allocator for small allocations. // We do not use BFC allocator for small allocations.
std::unordered_map<const void*, size_t> large_allocations_map_ std::unordered_map<const void*, size_t> large_allocations_map_
GUARDED_BY(mutex_); TF_GUARDED_BY(mutex_);
// Size in bytes that defines the upper-bound for "small" allocations. // Size in bytes that defines the upper-bound for "small" allocations.
// Any allocation below this threshold is a "small" allocation. // Any allocation below this threshold is a "small" allocation.

View File

@ -81,21 +81,21 @@ class PoolAllocator : public Allocator {
// consistency with other threads is not important. // consistency with other threads is not important.
// Number of Get() requests satisfied from pool. // Number of Get() requests satisfied from pool.
int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS { int64 get_from_pool_count() const TF_NO_THREAD_SAFETY_ANALYSIS {
return get_from_pool_count_; return get_from_pool_count_;
} }
// Number of Put() requests. // Number of Put() requests.
int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; } int64 put_count() const TF_NO_THREAD_SAFETY_ANALYSIS { return put_count_; }
// Number of Get() requests requiring a fresh allocation. // Number of Get() requests requiring a fresh allocation.
int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS { int64 allocated_count() const TF_NO_THREAD_SAFETY_ANALYSIS {
return allocated_count_; return allocated_count_;
} }
// Number of pool evictions. // Number of pool evictions.
int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS { int64 evicted_count() const TF_NO_THREAD_SAFETY_ANALYSIS {
return evicted_count_; return evicted_count_;
} }
// Current size limit. // Current size limit.
size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS { size_t size_limit() const TF_NO_THREAD_SAFETY_ANALYSIS {
return pool_size_limit_; return pool_size_limit_;
} }
@ -108,13 +108,13 @@ class PoolAllocator : public Allocator {
}; };
// Remove "pr" from the double-linked LRU list. // Remove "pr" from the double-linked LRU list.
void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); void RemoveFromList(PtrRecord* pr) TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Add "pr" to the head of the double-linked LRU list. // Add "pr" to the head of the double-linked LRU list.
void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); void AddToList(PtrRecord* pr) TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Delete the least recently used record. // Delete the least recently used record.
void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_); void EvictOne() TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
const string name_; const string name_;
const bool has_size_limit_; const bool has_size_limit_;
@ -123,13 +123,13 @@ class PoolAllocator : public Allocator {
std::unique_ptr<SubAllocator> allocator_; std::unique_ptr<SubAllocator> allocator_;
std::unique_ptr<RoundUpInterface> size_rounder_; std::unique_ptr<RoundUpInterface> size_rounder_;
mutex mutex_; mutex mutex_;
std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_); std::multimap<const size_t, PtrRecord*> pool_ TF_GUARDED_BY(mutex_);
PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; PtrRecord* lru_head_ TF_GUARDED_BY(mutex_) = nullptr;
PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; PtrRecord* lru_tail_ TF_GUARDED_BY(mutex_) = nullptr;
int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0; int64 get_from_pool_count_ TF_GUARDED_BY(mutex_) = 0;
int64 put_count_ GUARDED_BY(mutex_) = 0; int64 put_count_ TF_GUARDED_BY(mutex_) = 0;
int64 allocated_count_ GUARDED_BY(mutex_) = 0; int64 allocated_count_ TF_GUARDED_BY(mutex_) = 0;
int64 evicted_count_ GUARDED_BY(mutex_) = 0; int64 evicted_count_ TF_GUARDED_BY(mutex_) = 0;
}; };
// Do-nothing rounder. Passes through sizes unchanged. // Do-nothing rounder. Passes through sizes unchanged.

View File

@ -317,7 +317,7 @@ class ProcessFunctionLibraryRuntime {
FunctionLibraryRuntime::Handle AddHandleLocked( FunctionLibraryRuntime::Handle AddHandleLocked(
const string& function_key, const string& device_name, const string& function_key, const string& device_name,
FunctionLibraryRuntime::LocalHandle local_handle) FunctionLibraryRuntime::LocalHandle local_handle)
EXCLUSIVE_LOCKS_REQUIRED(mu_); TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
// For a given device_name, returns a DeviceContext for copying // For a given device_name, returns a DeviceContext for copying
// tensors to/from the device. // tensors to/from the device.
@ -423,11 +423,11 @@ class ProcessFunctionLibraryRuntime {
mutex mu_; mutex mu_;
const string target_device_; const string target_device_;
FunctionLibraryRuntime::LocalHandle local_handle_ GUARDED_BY(mu_); FunctionLibraryRuntime::LocalHandle local_handle_ TF_GUARDED_BY(mu_);
const string function_key_; const string function_key_;
bool is_cross_process_ GUARDED_BY(mu_) = false; bool is_cross_process_ TF_GUARDED_BY(mu_) = false;
bool init_started_ GUARDED_BY(mu_) = false; bool init_started_ TF_GUARDED_BY(mu_) = false;
Status init_result_ GUARDED_BY(mu_); Status init_result_ TF_GUARDED_BY(mu_);
Notification init_done_; Notification init_done_;
}; };
@ -442,22 +442,22 @@ class ProcessFunctionLibraryRuntime {
// Holds all the function instantiations. Maps function_keys to handles. // Holds all the function instantiations. Maps function_keys to handles.
std::unordered_map<string, FunctionLibraryRuntime::Handle> table_ std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
// Function data for instantiated remote functions. // Function data for instantiated remote functions.
std::unordered_map<FunctionLibraryRuntime::Handle, std::unordered_map<FunctionLibraryRuntime::Handle,
std::unique_ptr<FunctionData>> std::unique_ptr<FunctionData>>
function_data_ GUARDED_BY(mu_); function_data_ TF_GUARDED_BY(mu_);
// Function data for instantiated multi-device functions. // Function data for instantiated multi-device functions.
std::unordered_map<FunctionLibraryRuntime::Handle, std::unordered_map<FunctionLibraryRuntime::Handle,
std::unique_ptr<MultiDeviceFunctionData>> std::unique_ptr<MultiDeviceFunctionData>>
mdevice_data_ GUARDED_BY(mu_); mdevice_data_ TF_GUARDED_BY(mu_);
std::unique_ptr< std::unique_ptr<
std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>>> std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>>>
flr_map_; flr_map_;
int next_handle_ GUARDED_BY(mu_); int next_handle_ TF_GUARDED_BY(mu_);
const SessionMetadata* const session_metadata_; const SessionMetadata* const session_metadata_;
}; };

View File

@ -74,7 +74,7 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime {
private: private:
mutex mu_; mutex mu_;
int next_handle_ GUARDED_BY(mu_) = 0; int next_handle_ TF_GUARDED_BY(mu_) = 0;
DeviceMgr* device_mgr_; DeviceMgr* device_mgr_;
}; };

View File

@ -98,14 +98,14 @@ class ProcessState : public ProcessStateInterface {
// Indexed by numa_node. If we want numa-specific allocators AND a // Indexed by numa_node. If we want numa-specific allocators AND a
// non-specific allocator, maybe should index by numa_node+1. // non-specific allocator, maybe should index by numa_node+1.
std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_); std::vector<Allocator*> cpu_allocators_ TF_GUARDED_BY(mu_);
std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_); std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_);
std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_); std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_);
// Optional RecordingAllocators that wrap the corresponding // Optional RecordingAllocators that wrap the corresponding
// Allocators for runtime attribute use analysis. // Allocators for runtime attribute use analysis.
MDMap mem_desc_map_; MDMap mem_desc_map_;
std::vector<Allocator*> cpu_al_ GUARDED_BY(mu_); std::vector<Allocator*> cpu_al_ TF_GUARDED_BY(mu_);
}; };
namespace internal { namespace internal {

View File

@ -102,8 +102,8 @@ class RingAlg : public CollectiveImplementationInterface {
private: private:
mutex pcq_mu_; mutex pcq_mu_;
condition_variable cv_; condition_variable cv_;
int waiter_count_ GUARDED_BY(pcq_mu_) = 0; int waiter_count_ TF_GUARDED_BY(pcq_mu_) = 0;
std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_); std::deque<RingField*> deque_ TF_GUARDED_BY(pcq_mu_);
}; };
const CollectiveType type_; const CollectiveType type_;
@ -117,7 +117,7 @@ class RingAlg : public CollectiveImplementationInterface {
Notification group_size_tensor_ready_; Notification group_size_tensor_ready_;
std::unique_ptr<CollectiveAdapter> ca_; std::unique_ptr<CollectiveAdapter> ca_;
mutex status_mu_; mutex status_mu_;
Status status_ GUARDED_BY(status_mu_); Status status_ TF_GUARDED_BY(status_mu_);
std::vector<RingField> rfv_; std::vector<RingField> rfv_;
}; };

View File

@ -94,7 +94,7 @@ class FailTestRMA : public CollectiveRemoteAccessLocal {
} }
mutex mu_; mutex mu_;
int fail_after_ GUARDED_BY(mu_); int fail_after_ TF_GUARDED_BY(mu_);
}; };
std::unique_ptr<OpKernel> GetKernel(const NodeDef& node, std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
@ -528,7 +528,7 @@ class RingGathererTest : public ::testing::Test {
std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_; std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
std::unique_ptr<string> gpu_ring_order_; std::unique_ptr<string> gpu_ring_order_;
mutex mu_; mutex mu_;
int32 gather_counter_ GUARDED_BY(mu_) = 0; int32 gather_counter_ TF_GUARDED_BY(mu_) = 0;
}; };
CollectiveParams SetUpCollectiveParams(const int num_devs_per_task, CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,

View File

@ -94,7 +94,7 @@ class FailTestRMA : public CollectiveRemoteAccessLocal {
} }
mutex mu_; mutex mu_;
int fail_after_ GUARDED_BY(mu_); int fail_after_ TF_GUARDED_BY(mu_);
}; };
std::unique_ptr<OpKernel> GetKernel(const NodeDef& node, std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
@ -556,7 +556,7 @@ class RingReducerTest : public ::testing::Test {
std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_; std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
std::unique_ptr<string> gpu_ring_order_; std::unique_ptr<string> gpu_ring_order_;
mutex mu_; mutex mu_;
int32 reduce_counter_ GUARDED_BY(mu_) = 0; int32 reduce_counter_ TF_GUARDED_BY(mu_) = 0;
}; };
CollectiveParams SetUpCollectiveParams(const int num_devs_per_task, CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,

View File

@ -51,7 +51,7 @@ class ScopedAllocator {
// Automatically deletes when last use expires, or when // Automatically deletes when last use expires, or when
// ScopedAllocatorContainer decides to delete. // ScopedAllocatorContainer decides to delete.
~ScopedAllocator() LOCKS_EXCLUDED(mu_); ~ScopedAllocator() TF_LOCKS_EXCLUDED(mu_);
// For debugging: returns true iff p is a pointer that could have // For debugging: returns true iff p is a pointer that could have
// been returned by AllocateRaw. // been returned by AllocateRaw.
@ -66,8 +66,8 @@ class ScopedAllocator {
friend class ScopedAllocatorInstance; friend class ScopedAllocatorInstance;
// Only ScopedAllocatorInstances can call AllocateRaw and DeallocateRaw on a // Only ScopedAllocatorInstances can call AllocateRaw and DeallocateRaw on a
// ScopedAllocator // ScopedAllocator
void* AllocateRaw(int32 field_index, size_t num_bytes) LOCKS_EXCLUDED(mu_); void* AllocateRaw(int32 field_index, size_t num_bytes) TF_LOCKS_EXCLUDED(mu_);
void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_); void DeallocateRaw(void* p) TF_LOCKS_EXCLUDED(mu_);
Tensor backing_tensor_; Tensor backing_tensor_;
TensorBuffer* tbuf_; TensorBuffer* tbuf_;
int32 id_; int32 id_;
@ -75,8 +75,8 @@ class ScopedAllocator {
ScopedAllocatorContainer* container_; ScopedAllocatorContainer* container_;
std::vector<Field> fields_; std::vector<Field> fields_;
mutex mu_; mutex mu_;
int32 expected_call_count_ GUARDED_BY(mu_); int32 expected_call_count_ TF_GUARDED_BY(mu_);
int32 live_alloc_count_ GUARDED_BY(mu_); int32 live_alloc_count_ TF_GUARDED_BY(mu_);
}; };
// An Allocator that will return a pointer into the backing buffer of // An Allocator that will return a pointer into the backing buffer of
@ -98,14 +98,14 @@ class ScopedAllocatorInstance : public Allocator {
// When a ScopedAllocatorContainer "Drops" a scope_id, it calls DropFromTable // When a ScopedAllocatorContainer "Drops" a scope_id, it calls DropFromTable
// on the underlying ScopedAllocatorInstance. If this instance has already // on the underlying ScopedAllocatorInstance. If this instance has already
// deallocated the tensor slice, we can safely delete this. // deallocated the tensor slice, we can safely delete this.
void DropFromTable() LOCKS_EXCLUDED(mu_); void DropFromTable() TF_LOCKS_EXCLUDED(mu_);
void* AllocateRaw(size_t alignment, size_t num_bytes) void* AllocateRaw(size_t alignment, size_t num_bytes)
LOCKS_EXCLUDED(mu_) override; TF_LOCKS_EXCLUDED(mu_) override;
void* AllocateRaw(size_t alignment, size_t num_bytes, void* AllocateRaw(size_t alignment, size_t num_bytes,
const AllocationAttributes& allocator_attr) override { const AllocationAttributes& allocator_attr) override {
return AllocateRaw(alignment, num_bytes); return AllocateRaw(alignment, num_bytes);
} }
void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_) override; void DeallocateRaw(void* p) TF_LOCKS_EXCLUDED(mu_) override;
bool TracksAllocationSizes() const override { return false; } bool TracksAllocationSizes() const override { return false; }
size_t RequestedSize(const void* ptr) const override { return 0; } size_t RequestedSize(const void* ptr) const override { return 0; }
size_t AllocatedSize(const void* ptr) const override { return 0; } size_t AllocatedSize(const void* ptr) const override { return 0; }
@ -117,9 +117,9 @@ class ScopedAllocatorInstance : public Allocator {
mutex mu_; mutex mu_;
ScopedAllocator* scoped_allocator_; ScopedAllocator* scoped_allocator_;
int32 field_index_; int32 field_index_;
bool allocated_ GUARDED_BY(mu_); bool allocated_ TF_GUARDED_BY(mu_);
bool deallocated_ GUARDED_BY(mu_); bool deallocated_ TF_GUARDED_BY(mu_);
bool in_table_ GUARDED_BY(mu_); bool in_table_ TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -66,7 +66,7 @@ class ScopedAllocatorContainer : public core::RefCounted {
: field_index(ScopedAllocator::kBackingIndex), : field_index(ScopedAllocator::kBackingIndex),
scoped_allocator(nullptr) {} scoped_allocator(nullptr) {}
}; };
std::unordered_map<int32, SAField> allocators_ GUARDED_BY(mu_); std::unordered_map<int32, SAField> allocators_ TF_GUARDED_BY(mu_);
}; };
// At most one of these exists per device. // At most one of these exists per device.
@ -103,7 +103,7 @@ class ScopedAllocatorMgr {
string device_name_; string device_name_;
mutex mu_; mutex mu_;
std::unordered_map<int64, ScopedAllocatorContainer*> per_step_map_ std::unordered_map<int64, ScopedAllocatorContainer*> per_step_map_
GUARDED_BY(mu_); TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -198,14 +198,14 @@ class StepStatsCollector : public StepStatsCollectorInterface {
typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector; typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector;
typedef std::unordered_map<uint32, string> ThreadNamesMap; typedef std::unordered_map<uint32, string> ThreadNamesMap;
void FinalizeInternal() EXCLUSIVE_LOCKS_REQUIRED(mu_); void FinalizeInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
mutex mu_; mutex mu_;
bool finalized_ GUARDED_BY(mu_); bool finalized_ TF_GUARDED_BY(mu_);
std::unordered_map<string, NodeStatsVector> dev_stats_ GUARDED_BY(mu_); std::unordered_map<string, NodeStatsVector> dev_stats_ TF_GUARDED_BY(mu_);
std::unordered_map<string, ThreadNamesMap> thread_names_ GUARDED_BY(mu_); std::unordered_map<string, ThreadNamesMap> thread_names_ TF_GUARDED_BY(mu_);
StepStats* step_stats_ GUARDED_BY(mu_); StepStats* step_stats_ TF_GUARDED_BY(mu_);
uint64 collected_nodes_ GUARDED_BY(mu_) = 0; uint64 collected_nodes_ TF_GUARDED_BY(mu_) = 0;
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -64,8 +64,8 @@ class SYCLAllocator : public Allocator {
private: private:
mutable mutex mu_; mutable mutex mu_;
Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_); // owned Eigen::SyclDevice* sycl_device_ TF_GUARDED_BY(mu_); // owned
AllocatorStats stats_ GUARDED_BY(mu_); AllocatorStats stats_ TF_GUARDED_BY(mu_);
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator); TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
}; };

View File

@ -114,7 +114,7 @@ class TestCollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
void RetireStepId(int64 graph_key, int64 step_id) override {} void RetireStepId(int64 graph_key, int64 step_id) override {}
mutex mu_; mutex mu_;
gtl::FlatMap<int64, CollectiveExecutor*> table_ GUARDED_BY(mu_); gtl::FlatMap<int64, CollectiveExecutor*> table_ TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -61,7 +61,7 @@ class DebugCallbackRegistry {
mutex mu_; mutex mu_;
// Maps debug_url keys to callbacks for routing observed tensors. // Maps debug_url keys to callbacks for routing observed tensors.
std::map<string, EventCallback> keyed_callback_ GUARDED_BY(mu_); std::map<string, EventCallback> keyed_callback_ TF_GUARDED_BY(mu_);
static DebugCallbackRegistry* instance_; static DebugCallbackRegistry* instance_;
}; };

View File

@ -192,8 +192,8 @@ TEST_F(GrpcDebugTest, SendMultipleDebugTensorsSynchronizedViaGrpcTest) {
mutex mu; mutex mu;
Notification all_done; Notification all_done;
int tensor_count GUARDED_BY(mu) = 0; int tensor_count TF_GUARDED_BY(mu) = 0;
std::vector<Status> statuses GUARDED_BY(mu); std::vector<Status> statuses TF_GUARDED_BY(mu);
const std::vector<string> urls({server_data_.url}); const std::vector<string> urls({server_data_.url});

View File

@ -60,9 +60,9 @@ class TestEventListenerImpl final : public EventListener::Service {
std::atomic_bool stop_requested_; std::atomic_bool stop_requested_;
std::atomic_bool stopped_; std::atomic_bool stopped_;
std::vector<DebugNodeKey> debug_node_keys_ GUARDED_BY(states_mu_); std::vector<DebugNodeKey> debug_node_keys_ TF_GUARDED_BY(states_mu_);
std::vector<EventReply::DebugOpStateChange::State> new_states_ std::vector<EventReply::DebugOpStateChange::State> new_states_
GUARDED_BY(states_mu_); TF_GUARDED_BY(states_mu_);
std::unordered_set<DebugNodeKey> write_enabled_debug_node_keys_; std::unordered_set<DebugNodeKey> write_enabled_debug_node_keys_;

View File

@ -357,11 +357,11 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) {
const string dump_root_base = testing::TmpDir(); const string dump_root_base = testing::TmpDir();
mutex mu; mutex mu;
std::vector<string> dump_roots GUARDED_BY(mu); std::vector<string> dump_roots TF_GUARDED_BY(mu);
std::vector<string> dump_file_paths GUARDED_BY(mu); std::vector<string> dump_file_paths TF_GUARDED_BY(mu);
int dump_count GUARDED_BY(mu) = 0; int dump_count TF_GUARDED_BY(mu) = 0;
int done_count GUARDED_BY(mu) = 0; int done_count TF_GUARDED_BY(mu) = 0;
Notification all_done; Notification all_done;
auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots, auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots,

View File

@ -103,7 +103,7 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
const WorkerEnv* const worker_env_; const WorkerEnv* const worker_env_;
mutex mu_; mutex mu_;
Table table_ GUARDED_BY(mu_); Table table_ TF_GUARDED_BY(mu_);
BaseRemoteRendezvous* FindOrCreate(int64 step_id); BaseRemoteRendezvous* FindOrCreate(int64 step_id);
@ -182,9 +182,9 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
mutable mutex init_mu_; mutable mutex init_mu_;
// Status given by StartAbort() if any. // Status given by StartAbort() if any.
Status status_ GUARDED_BY(init_mu_); Status status_ TF_GUARDED_BY(init_mu_);
WorkerSession* session_ GUARDED_BY(init_mu_); // Not owned. WorkerSession* session_ TF_GUARDED_BY(init_mu_); // Not owned.
// Data structures to handle calls when partially initialized. // Data structures to handle calls when partially initialized.
struct DeferredCall { struct DeferredCall {
@ -193,16 +193,16 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
DeferredCall(const ParsedKey& parsed, DoneCallback done); DeferredCall(const ParsedKey& parsed, DoneCallback done);
}; };
std::vector<DeferredCall> deferred_calls_ GUARDED_BY(init_mu_); std::vector<DeferredCall> deferred_calls_ TF_GUARDED_BY(init_mu_);
typedef std::function<void()> InactiveCallback; typedef std::function<void()> InactiveCallback;
// Active outstanding RecvTensor calls. // Active outstanding RecvTensor calls.
mutex active_mu_; mutex active_mu_;
std::unordered_map<BaseRecvTensorCall*, InactiveCallback> active_ std::unordered_map<BaseRecvTensorCall*, InactiveCallback> active_
GUARDED_BY(active_mu_); TF_GUARDED_BY(active_mu_);
bool is_initialized_locked() SHARED_LOCKS_REQUIRED(init_mu_) { bool is_initialized_locked() TF_SHARED_LOCKS_REQUIRED(init_mu_) {
return session_ != nullptr; return session_ != nullptr;
} }
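
is_initialized_locked above shows the shared-lock variant: TF_SHARED_LOCKS_REQUIRED is satisfied by either an exclusive mutex_lock or a tf_shared_lock at the call site. A brief sketch with hypothetical names:

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class SessionState {  // hypothetical, for illustration only
 public:
  bool initialized() const TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::tf_shared_lock l(mu_);  // shared (reader) lock
    return initialized_locked();        // satisfies the REQUIRED annotation
  }

 private:
  // Callers must hold mu_ at least in shared mode.
  bool initialized_locked() const TF_SHARED_LOCKS_REQUIRED(mu_) {
    return session_ != nullptr;
  }

  mutable tensorflow::mutex mu_;
  void* session_ TF_GUARDED_BY(mu_) = nullptr;
};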

View File

@ -68,10 +68,10 @@ class CallOptions {
private: private:
mutex mu_; mutex mu_;
CancelFunction cancel_func_ GUARDED_BY(mu_); CancelFunction cancel_func_ TF_GUARDED_BY(mu_);
// RPC operation timeout in milliseconds. // RPC operation timeout in milliseconds.
int64 timeout_in_ms_ GUARDED_BY(mu_) = 0; int64 timeout_in_ms_ TF_GUARDED_BY(mu_) = 0;
TF_DISALLOW_COPY_AND_ASSIGN(CallOptions); TF_DISALLOW_COPY_AND_ASSIGN(CallOptions);
}; };

View File

@ -82,7 +82,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
recv_keys(recv_keys) {} recv_keys(recv_keys) {}
}; };
std::vector<FunctionData> function_data_ GUARDED_BY(mu_); std::vector<FunctionData> function_data_ TF_GUARDED_BY(mu_);
}; };
} // namespace tensorflow } // namespace tensorflow

View File

@ -48,11 +48,11 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal {
protected: protected:
// Returns true iff there's an entry for this group_key in the // Returns true iff there's an entry for this group_key in the
// local group_table_. // local group_table_.
bool GroupIsCached(int32 group_key) LOCKS_EXCLUDED(group_mu_); bool GroupIsCached(int32 group_key) TF_LOCKS_EXCLUDED(group_mu_);
// Updates group_table_ with contents of resp. // Updates group_table_ with contents of resp.
Status UpdateGroupCache(const CompleteGroupResponse& resp) Status UpdateGroupCache(const CompleteGroupResponse& resp)
LOCKS_EXCLUDED(group_mu_); TF_LOCKS_EXCLUDED(group_mu_);
// Finds the GroupRec that corresponds to cp->group_key and also // Finds the GroupRec that corresponds to cp->group_key and also
// populates cp->group from that GroupRec. // populates cp->group from that GroupRec.
@ -65,13 +65,13 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal {
// Returns true iff there's an entry for this instance_key in the // Returns true iff there's an entry for this instance_key in the
// local instance_table_. // local instance_table_.
bool InstanceIsCached(int32 instance_key) LOCKS_EXCLUDED(instance_mu_); bool InstanceIsCached(int32 instance_key) TF_LOCKS_EXCLUDED(instance_mu_);
// Updates instance_table_ with contents of resp. // Updates instance_table_ with contents of resp.
void UpdateInstanceCache(const GroupRec* gr, CollectiveParams* cp, void UpdateInstanceCache(const GroupRec* gr, CollectiveParams* cp,
const CompleteInstanceResponse& resp, const CompleteInstanceResponse& resp,
const StatusCallback& done) const StatusCallback& done)
LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_); TF_LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
// Finish populating *cp. Semantics are like those of // Finish populating *cp. Semantics are like those of
// CompleteInstanceLocal but will make a remote call to the group // CompleteInstanceLocal but will make a remote call to the group
@ -80,7 +80,7 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal {
CollectiveParams* cp, CollectiveParams* cp,
CancellationManager* cancel_mgr, CancellationManager* cancel_mgr,
const StatusCallback& done) const StatusCallback& done)
LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_); TF_LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
WorkerCacheInterface* worker_cache_; // Not owned WorkerCacheInterface* worker_cache_; // Not owned
const string group_leader_; const string group_leader_;

Some files were not shown because too many files have changed in this diff.