Try to deduplicate thread IDs. pthread_t is 8 bytes; on some platforms, casting it to 32 bits may cause duplicate IDs.

1. When gettid is available, using it gives us a unique 32-bit value. 2. When gettid is not available, hashing rather than truncating gives us a better chance of a unique thread ID. PiperOrigin-RevId: 316155614 Change-Id: I4fdbd43d22ef4420ca1b54efa8732d35ee35ffe1
2020-06-12 12:32:04 -07:00 · 2020-06-12 12:32:04 -07:00 · 0511d9fd95
commit 0511d9fd95
parent 798f7515f9
2 changed files with 20 additions and 22 deletions
--- a/tensorflow/core/platform/default/env.cc
+++ b/tensorflow/core/platform/default/env.cc
@ -135,15 +135,8 @@ class PosixEnv : public Env {
  }

  int32 GetCurrentThreadId() override {
-#ifdef __APPLE__
-    uint64_t tid64;
-    pthread_threadid_np(nullptr, &tid64);
-    return static_cast<int32>(tid64);
-#elif defined(__FreeBSD__)
-    return pthread_getthreadid_np();
-#else
-    return static_cast<int32>(pthread_self());
-#endif
+    static thread_local int32 current_thread_id = GetCurrentThreadIdInternal();
+    return current_thread_id;
  }

  bool GetCurrentThreadName(string* name) override {
@ -232,6 +225,20 @@ class PosixEnv : public Env {

 private:
  void GetLocalTempDirectories(std::vector<string>* list) override;
+
+  int32 GetCurrentThreadIdInternal() {
+#ifdef __APPLE__
+    uint64_t tid64;
+    pthread_threadid_np(nullptr, &tid64);
+    return static_cast<int32>(tid64);
+#elif defined(__FreeBSD__)
+    return pthread_getthreadid_np();
+#elif defined(__NR_gettid)
+    return static_cast<int32>(syscall(__NR_gettid));
+#else
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+#endif
+  }
 };

 }  // namespace
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
@ -126,15 +126,6 @@ const char *getActivityUnifiedMemoryKindString(
    }                                                                       \
  } while (false)

-// GetCachedTID() caches the thread ID in thread-local storage (which is a
-// userspace construct) to avoid unnecessary system calls. Without this caching,
-// it can take roughly 98ns, while it takes roughly 1ns with this caching.
-int32 GetCachedTID() {
-  static thread_local int32 current_thread_id =
-      Env::Default()->GetCurrentThreadId();
-  return current_thread_id;
-}
-
 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }

 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
@ -305,7 +296,7 @@ void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
  event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
  event.start_time_ns = start_time;
  event.end_time_ns = end_time;
-  event.thread_id = GetCachedTID();
+  event.thread_id = Env::Default()->GetCurrentThreadId();
  event.device_id = device_id;
  event.context_id = cbdata->contextUid;
  event.correlation_id = cbdata->correlationId;
@ -323,7 +314,7 @@ CuptiTracerEvent PopulateMemcpyCallbackEvent(
  event.source = CuptiTracerEventSource::DriverCallback;
  event.start_time_ns = start_time;
  event.end_time_ns = end_time;
-  event.thread_id = GetCachedTID();
+  event.thread_id = Env::Default()->GetCurrentThreadId();
  event.device_id = src_device;
  event.context_id = cbdata->contextUid;
  event.correlation_id = cbdata->correlationId;
@ -387,7 +378,7 @@ void AddCudaMallocEventUponApiExit(CuptiTraceCollector *collector,
  event.name = cbdata->functionName;
  event.start_time_ns = start_time;
  event.end_time_ns = end_time;
-  event.thread_id = GetCachedTID();
+  event.thread_id = Env::Default()->GetCurrentThreadId();
  event.device_id = device_id;
  event.context_id = cbdata->contextUid;
  event.correlation_id = cbdata->correlationId;
@ -406,7 +397,7 @@ void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
  event.name = cbdata->functionName;
  event.start_time_ns = start_time;
  event.end_time_ns = end_time;
-  event.thread_id = GetCachedTID();
+  event.thread_id = Env::Default()->GetCurrentThreadId();
  event.device_id = device_id;
  event.context_id = cbdata->contextUid;
  event.correlation_id = cbdata->correlationId;