diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD index 717c3af7b63..8639ae6a098 100644 --- a/tensorflow/core/profiler/internal/cpu/BUILD +++ b/tensorflow/core/profiler/internal/cpu/BUILD @@ -109,6 +109,8 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/profiler/utils:time_utils", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc index f3d128f4539..e9e86382fdb 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc @@ -39,14 +39,14 @@ void MakeCompleteEvents(TraceMeRecorder::Events* events) { // before the record created by ActivityEnd. Cross-thread events must be // processed in a separate pass. A single map can be used because the // activity_id is globally unique. - absl::flat_hash_map<uint64, TraceMeRecorder::Event*> start_events; + absl::flat_hash_map<int64, TraceMeRecorder::Event*> start_events; std::vector<TraceMeRecorder::Event*> end_events; for (auto& thread : *events) { for (auto& event : thread.events) { - if (IsStartEvent(event)) { - start_events.emplace(event.activity_id, &event); - } else if (IsEndEvent(event)) { - auto iter = start_events.find(event.activity_id); + if (event.IsStart()) { + start_events.emplace(event.ActivityId(), &event); + } else if (event.IsEnd()) { + auto iter = start_events.find(event.ActivityId()); if (iter != start_events.end()) { // same thread auto* start_event = iter->second; event.name = std::move(start_event->name); @@ -59,7 +59,7 @@ void MakeCompleteEvents(TraceMeRecorder::Events* events) { } } for (auto* event : end_events) { // cross-thread - auto iter = start_events.find(event->activity_id); + auto iter = start_events.find(event->ActivityId()); if (iter != start_events.end()) { auto* start_event = iter->second; event->name = std::move(start_event->name); @@ -79,7 +79,7 @@ void ConvertCompleteEventsToXPlane(uint64 start_timestamp_ns, xline.SetTimestampNs(start_timestamp_ns); xline.ReserveEvents(thread.events.size()); for (const auto& event : thread.events) { - if (!IsCompleteEvent(event)) continue; + if (!event.IsComplete()) continue; if (event.start_time < start_timestamp_ns) continue; Annotation annotation = ParseAnnotation(event.name); XEventMetadata* xevent_metadata = diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h index 5770058cc6d..7d069309452 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h @@ -22,22 +22,6 @@ limitations under the License. namespace tensorflow { namespace profiler { -// Returns true if event was created by TraceMe::ActivityStart. -inline bool IsStartEvent(const TraceMeRecorder::Event& event) { - return (event.start_time != 0) && (event.end_time == 0); -} - -// Returns true if event was created by TraceMe::ActivityEnd. -inline bool IsEndEvent(const TraceMeRecorder::Event& event) { - return (event.start_time == 0) && (event.end_time != 0); -} - -// Returns true if event was created by TraceMe::Stop or MakeCompleteEvents -// below. -inline bool IsCompleteEvent(const TraceMeRecorder::Event& event) { - return (event.start_time != 0) && (event.end_time != 0); -} - // Combine events created by TraceMe::ActivityStart and TraceMe::ActivityEnd, // which can be paired up by their activity_id. void MakeCompleteEvents(TraceMeRecorder::Events* events); diff --git a/tensorflow/core/profiler/internal/cpu/traceme_recorder.cc b/tensorflow/core/profiler/internal/cpu/traceme_recorder.cc index 57d36127493..a4b8bf5096a 100644 --- a/tensorflow/core/profiler/internal/cpu/traceme_recorder.cc +++ b/tensorflow/core/profiler/internal/cpu/traceme_recorder.cc @@ -254,7 +254,7 @@ bool TraceMeRecorder::StartRecording(int level) { return started; } -void TraceMeRecorder::Record(Event event) { +void TraceMeRecorder::Record(Event&& event) { static thread_local ThreadLocalRecorder thread_local_recorder; thread_local_recorder.Record(std::move(event)); } @@ -270,16 +270,16 @@ TraceMeRecorder::Events TraceMeRecorder::StopRecording() { return events; } -/*static*/ uint64 TraceMeRecorder::NewActivityId() { +/*static*/ int64 TraceMeRecorder::NewActivityId() { // Activity IDs: To avoid contention over a counter, the top 32 bits identify // the originating thread, the bottom 32 bits name the event within a thread. - // IDs may be reused after 4 billion events on one thread, or 4 billion + // IDs may be reused after 4 billion events on one thread, or 2 billion // threads. - static std::atomic<uint32> thread_counter(1); // avoid kUntracedActivity - const thread_local static uint32 thread_id = + static std::atomic<int32> thread_counter(1); // avoid kUntracedActivity + const thread_local static int32 thread_id = thread_counter.fetch_add(1, std::memory_order_relaxed); thread_local static uint32 per_thread_activity_id = 0; - return static_cast<uint64>(thread_id) << 32 | per_thread_activity_id++; + return static_cast<int64>(thread_id) << 32 | per_thread_activity_id++; } } // namespace profiler diff --git a/tensorflow/core/profiler/internal/cpu/traceme_recorder.h b/tensorflow/core/profiler/internal/cpu/traceme_recorder.h index 69246ac4aa9..4aab673e2d7 100644 --- a/tensorflow/core/profiler/internal/cpu/traceme_recorder.h +++ b/tensorflow/core/profiler/internal/cpu/traceme_recorder.h @@ -44,18 +44,29 @@ TF_EXPORT extern std::atomic<int> g_trace_level; // // This is the backend for TraceMe instrumentation. // The profiler starts the recorder, the TraceMe destructor records complete -// events. TraceMe::ActivityStart records begin events, and TraceMe::ActivityEnd +// events. TraceMe::ActivityStart records start events, and TraceMe::ActivityEnd // records end events. The profiler then stops the recorder and finds start/end // pairs. (Unpaired start/end events are discarded at that point). class TraceMeRecorder { public: // An Event is either the start of a TraceMe, the end of a TraceMe, or both. // Times are in ns since the Unix epoch. + // A negative time encodes the activity_id used to pair up the start of an + // event with its end. struct Event { - uint64 activity_id; + bool IsComplete() const { return start_time > 0 && end_time > 0; } + bool IsStart() const { return end_time < 0; } + bool IsEnd() const { return start_time < 0; } + + int64 ActivityId() const { + if (IsStart()) return -end_time; + if (IsEnd()) return -start_time; + return 1; // complete + } + std::string name; - uint64 start_time; // 0 = missing - uint64 end_time; // 0 = missing + int64 start_time; + int64 end_time; }; struct ThreadInfo { uint32 tid; @@ -85,10 +96,10 @@ class TraceMeRecorder { static constexpr int kTracingDisabled = -1; // Records an event. Non-blocking. - static void Record(Event event); + static void Record(Event&& event); // Returns an activity_id for TraceMe::ActivityStart. - static uint64 NewActivityId(); + static int64 NewActivityId(); private: class ThreadLocalRecorder; diff --git a/tensorflow/core/profiler/internal/cpu/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/cpu/traceme_recorder_test.cc index b6ab78ed546..e4f0f6d6a48 100644 --- a/tensorflow/core/profiler/internal/cpu/traceme_recorder_test.cc +++ b/tensorflow/core/profiler/internal/cpu/traceme_recorder_test.cc @@ -15,12 +15,13 @@ limitations under the License. #include "tensorflow/core/profiler/internal/cpu/traceme_recorder.h" #include <atomic> -#include <istream> #include <set> #include <string> #include <utility> #include <vector> +#include "absl/container/flat_hash_map.h" +#include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/utils/time_utils.h" namespace tensorflow { namespace profiler { @@ -41,15 +43,15 @@ MATCHER_P(Named, name, "") { return arg.name == name; } constexpr static uint64 kNanosInSec = 1000000000; TEST(RecorderTest, SingleThreaded) { - uint64 start_time = Env::Default()->NowNanos(); - uint64 end_time = start_time + kNanosInSec; + int64 start_time = GetCurrentTimeNanos(); + int64 end_time = start_time + kNanosInSec; - TraceMeRecorder::Record({1, "before", start_time, end_time}); + TraceMeRecorder::Record({"before", start_time, end_time}); TraceMeRecorder::Start(/*level=*/1); - TraceMeRecorder::Record({2, "during1", start_time, end_time}); - TraceMeRecorder::Record({3, "during2", start_time, end_time}); + TraceMeRecorder::Record({"during1", start_time, end_time}); + TraceMeRecorder::Record({"during2", start_time, end_time}); auto results = TraceMeRecorder::Stop(); - TraceMeRecorder::Record({4, "after", start_time, end_time}); + TraceMeRecorder::Record({"after", start_time, end_time}); ASSERT_EQ(results.size(), 1); EXPECT_THAT(results[0].events, @@ -83,15 +85,14 @@ TEST(RecorderTest, Multithreaded) { thread::ThreadPool pool(Env::Default(), "testpool", kNumThreads); std::atomic<int> thread_count = {0}; for (int i = 0; i < kNumThreads; i++) { - pool.Schedule([&start, &stop, &thread_count, i] { + pool.Schedule([&start, &stop, &thread_count] { uint64 j = 0; bool was_active = false; - auto record_event = [&j, i]() { - uint64 start_time = Env::Default()->NowNanos(); - uint64 end_time = start_time + kNanosInSec; - TraceMeRecorder::Record({/*activity_id=*/j++, - /*name=*/absl::StrCat(i), start_time, - end_time}); + auto record_event = [&j]() { + int64 start_time = GetCurrentTimeNanos(); + int64 end_time = start_time + kNanosInSec; + TraceMeRecorder::Record( + {/*name=*/absl::StrCat(j++), start_time, end_time}); }; thread_count.fetch_add(1, std::memory_order_relaxed); start.WaitForNotification(); @@ -121,15 +122,17 @@ TEST(RecorderTest, Multithreaded) { } // For each thread, keep track of which events we've seen. - struct { + struct ThreadState { bool split_session = false; bool overlapping_sessions = false; std::set<uint64> events; - } thread_state[kNumThreads]; + }; + absl::flat_hash_map<uint32 /*tid*/, ThreadState> thread_state; // We expect each thread to eventually have multiple events, not all in a // contiguous range. auto done = [&thread_state] { - for (const auto& t : thread_state) { + for (const auto& id_and_thread : thread_state) { + auto& t = id_and_thread.second; if (t.events.size() < 2) return false; } return true; @@ -153,20 +156,19 @@ TEST(RecorderTest, Multithreaded) { auto results = TraceMeRecorder::Stop(); for (const auto& thread : results) { if (thread.events.empty()) continue; - std::istringstream ss(thread.events.front().name); - int thread_index = 0; - ss >> thread_index; - auto& state = thread_state[thread_index]; + auto& state = thread_state[thread.thread.tid]; std::set<uint64> session_events; uint64 current = 0; for (const auto& event : thread.events) { - session_events.emplace(event.activity_id); + uint64 activity_id; + ASSERT_TRUE(absl::SimpleAtoi(event.name, &activity_id)); + session_events.emplace(activity_id); // Session events should be contiguous. - if (current != 0 && event.activity_id != current + 1) { + if (current != 0 && activity_id != current + 1) { state.split_session = true; } - current = event.activity_id; + current = activity_id; } for (const auto& event : session_events) { @@ -182,7 +184,8 @@ TEST(RecorderTest, Multithreaded) { } stop.Notify(); - for (const auto& thread : thread_state) { + for (const auto& id_and_thread : thread_state) { + auto& thread = id_and_thread.second; EXPECT_FALSE(thread.split_session) << "Expected contiguous events in a session"; EXPECT_FALSE(thread.overlapping_sessions) << "Expected disjoint sessions"; diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 5f48e1effa9..9162794854f 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -174,8 +174,8 @@ class TraceMe { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name), - start_time_, GetCurrentTimeNanos()}); + TraceMeRecorder::Record( + {std::move(no_init_.name), start_time_, GetCurrentTimeNanos()}); } no_init_.name.~string(); start_time_ = kUntracedActivity; @@ -210,12 +210,12 @@ class TraceMe { // Returns the activity ID, which is used to stop the activity. // Calls `name_generator` to get the name for activity. template <typename NameGeneratorT> - static uint64 ActivityStart(NameGeneratorT name_generator, int level = 1) { + static int64 ActivityStart(NameGeneratorT name_generator, int level = 1) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - uint64 activity_id = TraceMeRecorder::NewActivityId(); - TraceMeRecorder::Record({activity_id, name_generator(), - GetCurrentTimeNanos(), /*end_time=*/0}); + int64 activity_id = TraceMeRecorder::NewActivityId(); + TraceMeRecorder::Record( + {name_generator(), GetCurrentTimeNanos(), -activity_id}); return activity_id; } #endif @@ -224,12 +224,12 @@ class TraceMe { // Record the start time of an activity. // Returns the activity ID, which is used to stop the activity. - static uint64 ActivityStart(absl::string_view name, int level = 1) { + static int64 ActivityStart(absl::string_view name, int level = 1) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - uint64 activity_id = TraceMeRecorder::NewActivityId(); - TraceMeRecorder::Record({activity_id, std::string(name), - GetCurrentTimeNanos(), /*end_time=*/0}); + int64 activity_id = TraceMeRecorder::NewActivityId(); + TraceMeRecorder::Record( + {std::string(name), GetCurrentTimeNanos(), -activity_id}); return activity_id; } #endif @@ -237,23 +237,23 @@ class TraceMe { } // Same as ActivityStart above, an overload for "const std::string&" - static uint64 ActivityStart(const std::string& name, int level = 1) { + static int64 ActivityStart(const std::string& name, int level = 1) { return ActivityStart(absl::string_view(name), level); } // Same as ActivityStart above, an overload for "const char*" - static uint64 ActivityStart(const char* name, int level = 1) { + static int64 ActivityStart(const char* name, int level = 1) { return ActivityStart(absl::string_view(name), level); } // Record the end time of an activity started by ActivityStart(). - static void ActivityEnd(uint64 activity_id) { + static void ActivityEnd(int64 activity_id) { #if !defined(IS_MOBILE_PLATFORM) // We don't check the level again (see TraceMe::Stop()). if (TF_PREDICT_FALSE(activity_id != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - TraceMeRecorder::Record({activity_id, /*name=*/std::string(), - /*start_time=*/0, GetCurrentTimeNanos()}); + TraceMeRecorder::Record( + {std::string(), -activity_id, GetCurrentTimeNanos()}); } } #endif @@ -264,9 +264,9 @@ class TraceMe { static void InstantActivity(NameGeneratorT name_generator, int level = 1) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - uint64 now = GetCurrentTimeNanos(); - TraceMeRecorder::Record({kCompleteActivity, name_generator(), - /*start_time=*/now, /*end_time=*/now}); + int64 now = GetCurrentTimeNanos(); + TraceMeRecorder::Record( + {name_generator(), /*start_time=*/now, /*end_time=*/now}); } #endif } @@ -279,7 +279,7 @@ class TraceMe { #endif } - static uint64 NewActivityId() { + static int64 NewActivityId() { #if !defined(IS_MOBILE_PLATFORM) return TraceMeRecorder::NewActivityId(); #else @@ -288,10 +288,8 @@ class TraceMe { } private: - // Activity ID or start time used when tracing is disabled. - constexpr static uint64 kUntracedActivity = 0; - // Activity ID used as a placeholder when both start and end are present. - constexpr static uint64 kCompleteActivity = 1; + // Start time used when tracing is disabled. + constexpr static int64 kUntracedActivity = 0; TF_DISALLOW_COPY_AND_ASSIGN(TraceMe); @@ -303,7 +301,7 @@ class TraceMe { std::string name; } no_init_; - uint64 start_time_ = kUntracedActivity; + int64 start_time_ = kUntracedActivity; }; // Whether OpKernel::TraceString will populate additional information for diff --git a/tensorflow/core/profiler/utils/time_utils.cc b/tensorflow/core/profiler/utils/time_utils.cc index b6839d32ba3..f94e0a0e1a0 100644 --- a/tensorflow/core/profiler/utils/time_utils.cc +++ b/tensorflow/core/profiler/utils/time_utils.cc @@ -20,7 +20,7 @@ limitations under the License. namespace tensorflow { namespace profiler { -uint64 GetCurrentTimeNanos() { +int64 GetCurrentTimeNanos() { // absl::GetCurrentTimeNanos() is much faster than EnvTime::NowNanos(). // It is wrapped under tensorflow::profiler::GetCurrentTimeNanos to avoid ODR // violation and to allow switching to yet another implementation if required. diff --git a/tensorflow/core/profiler/utils/time_utils.h b/tensorflow/core/profiler/utils/time_utils.h index aafbcfe69f7..f5a30364b1d 100644 --- a/tensorflow/core/profiler/utils/time_utils.h +++ b/tensorflow/core/profiler/utils/time_utils.h @@ -38,7 +38,7 @@ inline double MillisToSeconds(double ms) { return ms / 1E3; } inline uint64 SecondsToNanos(double s) { return s * 1E9; } // Returns the current CPU wallclock time in nanoseconds. -uint64 GetCurrentTimeNanos(); +int64 GetCurrentTimeNanos(); } // namespace profiler } // namespace tensorflow