diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index a1bbcde94bd..3a49f6f3232 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -183,7 +183,7 @@ class DirectSessionFactory : public SessionFactory { // Must do this before the CPU allocator is created. if (options.config.graph_options().build_cost_model() > 0) { - EnableCPUAllocatorFullStats(true); + EnableCPUAllocatorFullStats(); } std::vector> devices; TF_RETURN_IF_ERROR(DeviceFactory::AddDevices( diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index d20f779c8da..d032276d0f7 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -56,9 +56,7 @@ Allocator::~Allocator() {} // If true, cpu allocator collects full stats. static bool cpu_allocator_collect_full_stats = false; -void EnableCPUAllocatorFullStats(bool enable) { - cpu_allocator_collect_full_stats = enable; -} +void EnableCPUAllocatorFullStats() { cpu_allocator_collect_full_stats = true; } bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; } string AllocatorAttributes::DebugString() const { diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 7b8eba0fda9..f7402f7b293 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -410,14 +410,17 @@ Allocator* cpu_allocator_base(); // call it directly. Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity); -// If 'enable' is true, the default CPU allocator implementation will collect -// AllocatorStats. By default, it's disabled. -void EnableCPUAllocatorStats(bool enable); +// Enables AllocatorStats in the default CPU allocator implementation. By +// default, it's disabled. 
+void EnableCPUAllocatorStats(); +// Disables AllocatorStats in the default CPU allocator implementation. By +// default, it's disabled. +void DisableCPUAllocatorStats(); bool CPUAllocatorStatsEnabled(); -// If 'enable' is true, the default CPU allocator implementation will collect -// full statistics. By default, it's disabled. -void EnableCPUAllocatorFullStats(bool enable); +// Enables full statistics collection in the default CPU allocator +// implementation. By default, it's disabled. +void EnableCPUAllocatorFullStats(); bool CPUAllocatorFullStatsEnabled(); // An object that does the underlying suballoc/free of memory for a higher-level diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc index 3caab02eeba..0ac3da1a19c 100644 --- a/tensorflow/core/framework/allocator_test.cc +++ b/tensorflow/core/framework/allocator_test.cc @@ -133,7 +133,7 @@ TEST(AllocatorAttributesDeathTest, MergeDifferentScopeIds) { } TEST(CPUAllocatorTest, Simple) { - EnableCPUAllocatorStats(true); + EnableCPUAllocatorStats(); Allocator* a = cpu_allocator(); std::vector ptrs; for (int s = 1; s < 1024; s++) { @@ -162,7 +162,7 @@ TEST(CPUAllocatorTest, Simple) { 1048576 * sizeof(double)); a->ClearStats(); CheckStats(a, 0, 0, 0, 0); - EnableCPUAllocatorStats(false); + DisableCPUAllocatorStats(); } // Define a struct that we will use to observe behavior in the unit tests @@ -227,13 +227,13 @@ static void BM_Allocation(int iters, int arg) { std::vector sizes = {256, 4096, 16384, 524288, 512, 1048576}; int size_index = 0; - if (arg) EnableCPUAllocatorStats(true); + if (arg) EnableCPUAllocatorStats(); while (--iters > 0) { int bytes = sizes[size_index++ % sizes.size()]; void* p = a->AllocateRaw(1, bytes); a->DeallocateRaw(p); } - if (arg) EnableCPUAllocatorStats(false); + if (arg) DisableCPUAllocatorStats(); } BENCHMARK(BM_Allocation)->Arg(0)->Arg(1); diff --git a/tensorflow/core/framework/cpu_allocator_impl.cc 
b/tensorflow/core/framework/cpu_allocator_impl.cc index 814233074fb..511cfce8ab5 100644 --- a/tensorflow/core/framework/cpu_allocator_impl.cc +++ b/tensorflow/core/framework/cpu_allocator_impl.cc @@ -29,9 +29,8 @@ namespace tensorflow { // If true, cpu allocator collects more stats. static bool cpu_allocator_collect_stats = false; -void EnableCPUAllocatorStats(bool enable) { - cpu_allocator_collect_stats = enable; -} +void EnableCPUAllocatorStats() { cpu_allocator_collect_stats = true; } +void DisableCPUAllocatorStats() { cpu_allocator_collect_stats = false; } bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; } static const int kMaxTotalAllocationWarnings = 1; diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h index 0f415cf0392..d391e15de1a 100644 --- a/tensorflow/core/grappler/clusters/cluster.h +++ b/tensorflow/core/grappler/clusters/cluster.h @@ -103,9 +103,9 @@ class Cluster { // superset of the devices listed in GetDevices/GetDeviceNames(). virtual const DeviceSet* GetDeviceSet() const { return nullptr; } - // Enables collecting the allocator stats. Call with enable=true must be made - // before Provision(). - virtual Status EnablePeakMemoryStats(bool enable) { + // Enables collecting the allocator stats. If called, must be called before + // Provision(). 
+ virtual Status EnablePeakMemoryStats() { return errors::Unimplemented(strings ::StrCat( "Peak Memory Stats are not supported on ", type(), " clusters")); } diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index c44b74efcdc..678daed02e4 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -202,9 +202,9 @@ Status SingleMachine::Run(const GraphDef& graph_def, return Status::OK(); } -Status SingleMachine::EnablePeakMemoryStats(bool enable) { - EnableCPUAllocatorStats(enable); - cpu_allocator_stats_enabled_ = enable; +Status SingleMachine::EnablePeakMemoryStats() { + EnableCPUAllocatorStats(); + cpu_allocator_stats_enabled_ = true; // No need to enable GPU allocator stats since its stats are always collected. return Status::OK(); } diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h index 9e085d161b6..48f56940ec4 100644 --- a/tensorflow/core/grappler/clusters/single_machine.h +++ b/tensorflow/core/grappler/clusters/single_machine.h @@ -45,7 +45,7 @@ class SingleMachine : public Cluster { const DeviceSet* GetDeviceSet() const override { return device_set_.get(); } - Status EnablePeakMemoryStats(bool enable) override; + Status EnablePeakMemoryStats() override; - // It requires EnableAllocatorStats(true) be called before Provision(). + // It requires EnablePeakMemoryStats() be called before Provision(). 
Status GetPeakMemoryUsage( diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index a85e4e17748..d0d525e0222 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -51,7 +51,7 @@ class SingleMachineTest : public ::testing::Test { #endif cluster_.reset( new SingleMachine(timeout_s, 3 /* num_cpu_cores */, 0 /* num_gpus */)); - TF_CHECK_OK(cluster_->EnablePeakMemoryStats(true)); + TF_CHECK_OK(cluster_->EnablePeakMemoryStats()); TF_CHECK_OK(cluster_->Provision()); } diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc index 7a6924e2ebf..461fb7deb78 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -414,7 +414,7 @@ TEST(GraphTransferer, GraphTransferer gt; gt.EnableStrictCheckMode(false); - profile_utils::CpuUtils::EnableClockCycleProfiling(true); + profile_utils::CpuUtils::EnableClockCycleProfiling(); ClockCycleProfiler prof; prof.Start(); Status status = gt.LoadGraphFromProtoFile( @@ -447,7 +447,7 @@ TEST(GraphTransferer, GraphTransferer gt; gt.EnableStrictCheckMode(false); - profile_utils::CpuUtils::EnableClockCycleProfiling(true); + profile_utils::CpuUtils::EnableClockCycleProfiling(); ClockCycleProfiler prof; prof.Start(); Status status = gt.LoadGraphFromProtoFile( @@ -481,7 +481,7 @@ TEST(GraphTransferer, GraphTransferer gt; gt.EnableStrictCheckMode(false); - profile_utils::CpuUtils::EnableClockCycleProfiling(true); + profile_utils::CpuUtils::EnableClockCycleProfiling(); ClockCycleProfiler prof; prof.Start(); Status status = gt.LoadGraphFromProtoFile( @@ -540,7 +540,7 @@ TEST(GraphTransferer, DISABLED_RunInceptionV3OnHexagonExampleWithFusedGraph) { TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) { 
CheckHexagonControllerVersion(); - profile_utils::CpuUtils::EnableClockCycleProfiling(true); + profile_utils::CpuUtils::EnableClockCycleProfiling(); const IRemoteFusedGraphOpsDefinitions* ops_definitions = &HexagonOpsDefinitions::getInstance(); diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc index 6dc1826d93b..d61a036181d 100644 --- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc +++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc @@ -54,12 +54,11 @@ uint64 AndroidArmV7ACpuUtilsHelper::GetCurrentClockCycle() { return static_cast(count); } -void AndroidArmV7ACpuUtilsHelper::EnableClockCycleProfiling(const bool enable) { +void AndroidArmV7ACpuUtilsHelper::EnableClockCycleProfiling() { if (!is_initialized_) { // Initialize here to avoid unnecessary initialization InitializeInternal(); } - if (enable) { const int64 cpu0_scaling_min = ReadCpuFrequencyFile(0, "scaling_min"); const int64 cpu0_scaling_max = ReadCpuFrequencyFile(0, "scaling_max"); if (cpu0_scaling_max != cpu0_scaling_min) { @@ -69,9 +68,14 @@ void AndroidArmV7ACpuUtilsHelper::EnableClockCycleProfiling(const bool enable) { } ResetClockCycle(); ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0); - } else { - ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); +} + +void AndroidArmV7ACpuUtilsHelper::DisableClockCycleProfiling() { + if (!is_initialized_) { + // Initialize here to avoid unnecessary initialization + InitializeInternal(); } + ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); } int64 AndroidArmV7ACpuUtilsHelper::CalculateCpuFrequency() { diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h index 2d94736c978..66bc0fd5928 100644 --- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h +++ 
b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h @@ -36,7 +36,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper { AndroidArmV7ACpuUtilsHelper() = default; void ResetClockCycle() final; uint64 GetCurrentClockCycle() final; - void EnableClockCycleProfiling(bool enable) final; + void EnableClockCycleProfiling() final; + void DisableClockCycleProfiling() final; int64 CalculateCpuFrequency() final; private: diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc index b22123a804a..7cd1c4de88f 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc @@ -58,8 +58,12 @@ static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr; GetCpuUtilsHelperSingletonInstance().ResetClockCycle(); } -/* static */ void CpuUtils::EnableClockCycleProfiling(const bool enable) { - GetCpuUtilsHelperSingletonInstance().EnableClockCycleProfiling(enable); +/* static */ void CpuUtils::EnableClockCycleProfiling() { + GetCpuUtilsHelperSingletonInstance().EnableClockCycleProfiling(); +} + +/* static */ void CpuUtils::DisableClockCycleProfiling() { + GetCpuUtilsHelperSingletonInstance().DisableClockCycleProfiling(); } /* static */ std::chrono::duration CpuUtils::ConvertClockCycleToTime( diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h index d26f28478a5..1132c485f90 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.h +++ b/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -138,9 +138,10 @@ class CpuUtils { // clock cycle counters from overflowing on some platforms. 
static void ResetClockCycle(); - // Enable clock cycle profile + // Enable/Disable clock cycle profile // You can enable / disable profile if it's supported by the platform - static void EnableClockCycleProfiling(bool enable); + static void EnableClockCycleProfiling(); + static void DisableClockCycleProfiling(); // Return chrono::duration per each clock static std::chrono::duration ConvertClockCycleToTime( @@ -152,7 +153,8 @@ class CpuUtils { DefaultCpuUtilsHelper() = default; void ResetClockCycle() final {} uint64 GetCurrentClockCycle() final { return DUMMY_CYCLE_CLOCK; } - void EnableClockCycleProfiling(bool /* enable */) final {} + void EnableClockCycleProfiling() final {} + void DisableClockCycleProfiling() final {} int64 CalculateCpuFrequency() final { return INVALID_FREQUENCY; } private: diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc index eb8161fbfd5..a18561a1156 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc @@ -26,7 +26,7 @@ static constexpr bool DBG = false; class CpuUtilsTest : public ::testing::Test { protected: - void SetUp() override { CpuUtils::EnableClockCycleProfiling(true); } + void SetUp() override { CpuUtils::EnableClockCycleProfiling(); } }; TEST_F(CpuUtilsTest, SetUpTestCase) {} diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h index cab7618a70a..bd63ffd0e85 100644 --- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h +++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h @@ -35,9 +35,10 @@ class ICpuUtilsHelper { virtual void ResetClockCycle() = 0; // Return current clock cycle. 
virtual uint64 GetCurrentClockCycle() = 0; - // Enable clock cycle profile + // Enable/Disable clock cycle profile // You can enable / disable profile if it's supported by the platform - virtual void EnableClockCycleProfiling(bool enable) = 0; + virtual void EnableClockCycleProfiling() = 0; + virtual void DisableClockCycleProfiling() = 0; // Return cpu frequency. // CAVEAT: as this method may read file and/or call system calls, // this call is supposed to be slow.