Implement a OpStats combiner.
PiperOrigin-RevId: 329773957 Change-Id: I1c93da3e3e527addd4d13a4db96c4ed3f831dd59
This commit is contained in:
parent
1b83992ade
commit
edb454c157
@ -540,22 +540,3 @@ tf_cc_test(
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "op_stats_combiner",
|
||||
srcs = ["op_stats_combiner.cc"],
|
||||
hdrs = ["op_stats_combiner.h"],
|
||||
deps = [
|
||||
":op_metrics_db_combiner",
|
||||
":xplane_to_tf_functions",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:steps_db_proto_cc",
|
||||
"//tensorflow/core/profiler/utils:hardware_type_utils",
|
||||
"//tensorflow/core/profiler/utils:step_interval",
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
],
|
||||
)
|
||||
|
@ -1,173 +0,0 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/profiler/convert/op_stats_combiner.h"
|
||||
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
|
||||
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
|
||||
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
|
||||
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
|
||||
#include "tensorflow/core/profiler/utils/step_interval.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
namespace {
|
||||
|
||||
// Combines the src PerCoreStepInfo into the dst PerCoreStepInfo.
|
||||
void CombinePerCoreStepInfo(
|
||||
int src_host_id, bool use_incomplete_step, const PerCoreStepInfo& src,
|
||||
PerCoreStepInfo* dst,
|
||||
OpMetricsDbCombiner* hlo_metrics_db_complete_steps_only_combiner,
|
||||
OpMetricsDbCombiner* hlo_metrics_db_per_step_combiner) {
|
||||
DCHECK_EQ(dst->step_num(), src.step_num());
|
||||
CombineCoreIdMap(src_host_id, src.step_info_per_core(),
|
||||
dst->mutable_step_info_per_core());
|
||||
if (!use_incomplete_step) {
|
||||
hlo_metrics_db_complete_steps_only_combiner->Combine(src.hlo_metrics_db());
|
||||
}
|
||||
hlo_metrics_db_per_step_combiner->Combine(src.hlo_metrics_db());
|
||||
CombineCoreIdMap(src_host_id, src.flow_db_per_core(),
|
||||
dst->mutable_flow_db_per_core());
|
||||
CombineCoreIdMap(src_host_id, src.all_reduce_db_per_core(),
|
||||
dst->mutable_all_reduce_db_per_core());
|
||||
CombineCoreIdMap(src_host_id, src.core_id_to_replica_id_map(),
|
||||
dst->mutable_core_id_to_replica_id_map());
|
||||
}
|
||||
|
||||
void CombineStepDatabase(
|
||||
int src_host_id, StepInterval step_intersection,
|
||||
const StepDatabaseResult& src, StepDatabaseResult* dst,
|
||||
OpMetricsDbCombiner* hlo_metrics_db_complete_steps_only_combiner,
|
||||
std::vector<OpMetricsDbCombiner>* hlo_metrics_db_per_step_combiners) {
|
||||
if (src.use_incomplete_step()) {
|
||||
dst->set_use_incomplete_step(true);
|
||||
}
|
||||
for (const PerCoreStepInfo& src_step_info : src.step_sequence()) {
|
||||
uint32 step_num = src_step_info.step_num();
|
||||
if (!step_intersection.Contains(step_num)) {
|
||||
continue;
|
||||
}
|
||||
uint32 dst_step_sequence_index = step_intersection.Index(step_num);
|
||||
CombinePerCoreStepInfo(
|
||||
src_host_id, src.use_incomplete_step(), src_step_info,
|
||||
dst->mutable_step_sequence(dst_step_sequence_index),
|
||||
hlo_metrics_db_complete_steps_only_combiner,
|
||||
&(*hlo_metrics_db_per_step_combiners)[dst_step_sequence_index]);
|
||||
}
|
||||
}
|
||||
|
||||
void CombineRunEnvironment(const RunEnvironment& src, RunEnvironment* dst) {
|
||||
dst->mutable_hostnames()->insert(src.hostnames().begin(),
|
||||
src.hostnames().end());
|
||||
dst->set_host_count(dst->hostnames_size());
|
||||
if (src.device_type() != "CPU") {
|
||||
dst->set_device_type(src.device_type());
|
||||
// TODO(b/111402648): Batch size may differ per-core. Currently, we report
|
||||
// the max batch size. We need to come up with a better measure.
|
||||
dst->set_per_core_batch_size(
|
||||
std::max(src.per_core_batch_size(), dst->per_core_batch_size()));
|
||||
dst->set_device_core_count(src.device_core_count() +
|
||||
dst->device_core_count());
|
||||
// Replica count and num cores per replica must be same for all copies.
|
||||
dst->set_replica_count(std::max(src.replica_count(), dst->replica_count()));
|
||||
dst->set_num_cores_per_replica(
|
||||
std::max(src.num_cores_per_replica(), dst->num_cores_per_replica()));
|
||||
*dst->mutable_topology() = src.topology();
|
||||
}
|
||||
dst->set_task_count(src.task_count() + dst->task_count());
|
||||
(*dst->mutable_host_independent_job_info()) = src.host_independent_job_info();
|
||||
for (const auto& job_info : src.host_dependent_job_info()) {
|
||||
*(dst->add_host_dependent_job_info()) = job_info;
|
||||
}
|
||||
dst->set_host_trace_level(src.host_trace_level());
|
||||
}
|
||||
|
||||
// Combines the src PerfEnv into the dst PerfEnv.
|
||||
void CombinePerfEnv(const PerfEnv& src, PerfEnv* dst) {
|
||||
dst->set_peak_tera_flops_per_second(src.peak_tera_flops_per_second());
|
||||
dst->set_peak_hbm_bw_giga_bytes_per_second(
|
||||
src.peak_hbm_bw_giga_bytes_per_second());
|
||||
dst->set_ridge_point(src.ridge_point());
|
||||
}
|
||||
|
||||
// Combines the src Diagnostics into the dst Diagnostics.
|
||||
void CombineDiagnostics(const Diagnostics& src, Diagnostics* dst) {
|
||||
dst->mutable_info()->MergeFrom(src.info());
|
||||
dst->mutable_warnings()->MergeFrom(src.warnings());
|
||||
dst->mutable_errors()->MergeFrom(src.errors());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool IsCoordinator(bool no_accelerator_in_system, HardwareType hardware_type) {
|
||||
// A host is a coordinator if:
|
||||
// (1) The host doesn't have a device, and
|
||||
// (2) The system does use accelerator (if not, it uses CPU only and so this
|
||||
// host should be regarded as a worker as well).
|
||||
return !HasDevice(hardware_type) && !no_accelerator_in_system;
|
||||
}
|
||||
|
||||
uint32 GlobalCoreId(int host_id, uint32 device_ordinal) {
|
||||
constexpr uint32 kMaxDevicesPerHost = 1000; // power-of-10 for debuggability
|
||||
return host_id * kMaxDevicesPerHost + device_ordinal;
|
||||
}
|
||||
|
||||
void CombineOpStats(
|
||||
bool no_accelerator_in_system, int src_host_id, HardwareType hardware_type,
|
||||
StepInterval step_intersection, const OpStats& src, OpStats* dst,
|
||||
OpMetricsDbCombiner* host_op_metrics_db_combiner,
|
||||
OpMetricsDbCombiner* device_op_metrics_db_combiner,
|
||||
OpMetricsDbCombiner* hlo_metrics_db_complete_steps_only_combiner,
|
||||
std::vector<OpMetricsDbCombiner>* hlo_metrics_db_per_step_combiners) {
|
||||
// Combine host_metrics_db.
|
||||
host_op_metrics_db_combiner->Combine(src.host_op_metrics_db());
|
||||
// Combine device_metrics_db.
|
||||
device_op_metrics_db_combiner->Combine(src.device_op_metrics_db());
|
||||
|
||||
// Combine step_db.
|
||||
if (!IsCoordinator(no_accelerator_in_system, hardware_type)) {
|
||||
CombineStepDatabase(src_host_id, step_intersection, src.step_db(),
|
||||
dst->mutable_step_db(),
|
||||
hlo_metrics_db_complete_steps_only_combiner,
|
||||
hlo_metrics_db_per_step_combiners);
|
||||
}
|
||||
|
||||
// Combine run environment info.
|
||||
CombineRunEnvironment(src.run_environment(), dst->mutable_run_environment());
|
||||
|
||||
// Combine the perf environment info.
|
||||
CombinePerfEnv(src.perf_env(), dst->mutable_perf_env());
|
||||
|
||||
// Combine diagnostics.
|
||||
CombineDiagnostics(src.diagnostics(), dst->mutable_diagnostics());
|
||||
|
||||
// Combine kernel stats.
|
||||
dst->mutable_kernel_stats_db()->mutable_reports()->MergeFrom(
|
||||
src.kernel_stats_db().reports());
|
||||
|
||||
// Combine tf-function stats.
|
||||
CombineTfFunctionDb(src.tf_function_db(), dst->mutable_tf_function_db());
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
@ -1,65 +0,0 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_
|
||||
#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_
|
||||
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
|
||||
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/utils/step_interval.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
// Whether a host is a coordinator.
|
||||
bool IsCoordinator(bool no_accelerator_in_system, HardwareType hardware_type);
|
||||
|
||||
// Translates the core id from single host to the one for multiple-host.
|
||||
// We need this translation because the device_ordinal was assigned when a
|
||||
// single host response was given. Now, we need a global core_id to distinguish
|
||||
// it with multiple hosts.
|
||||
uint32 GlobalCoreId(int host_id, uint32 device_ordinal);
|
||||
|
||||
// Combines the src map into the dst map.
|
||||
// The src map keys are local core_ids. The src_host_id is used to convert them
|
||||
// into global core_ids used as keys in the dst map.
|
||||
// REQUIRED: cores from src_host_id are not already in dst.
|
||||
template <typename T>
|
||||
void CombineCoreIdMap(int src_host_id, const proto2::Map<uint32, T>& src,
|
||||
proto2::Map<uint32, T>* dst) {
|
||||
for (const auto& [core_id, value] : src) {
|
||||
uint32 global_core_id = GlobalCoreId(src_host_id, core_id);
|
||||
auto [iter, was_inserted] = dst->insert({global_core_id, value});
|
||||
DCHECK(was_inserted) << "Duplicated core_id: " << iter->first;
|
||||
}
|
||||
}
|
||||
|
||||
// Combine the src OpStats into the dst OpStats.
|
||||
void CombineOpStats(
|
||||
bool no_accelerator_in_system, int src_host_id, HardwareType hardware_type,
|
||||
StepInterval step_intersection, const OpStats& src, OpStats* dst,
|
||||
OpMetricsDbCombiner* host_op_metrics_db_combiner,
|
||||
OpMetricsDbCombiner* device_op_metrics_db_combiner,
|
||||
OpMetricsDbCombiner* hlo_metrics_db_complete_steps_only_combiner,
|
||||
std::vector<OpMetricsDbCombiner>* hlo_metrics_db_per_step_combiners);
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_COMBINER_H_
|
@ -82,7 +82,5 @@ HardwareType ParseHardwareType(absl::string_view device_type) {
|
||||
return HardwareType::UNKNOWN_HARDWARE;
|
||||
}
|
||||
|
||||
bool HasDevice(HardwareType x) { return x > tensorflow::profiler::CPU_ONLY; }
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
@ -28,9 +28,6 @@ double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap);
|
||||
|
||||
HardwareType ParseHardwareType(absl::string_view device_type);
|
||||
|
||||
// Returns true if the given hardware type has a device.
|
||||
bool HasDevice(HardwareType x);
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user