From a8950d70bfe0405fa405127cf5fd824a7a778aac Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 15 Jun 2020 13:49:39 -0700 Subject: [PATCH] [tfdbg2] Add tfdbg_run_id to metadata of data dumps - A data dump file set generated by tfdbg2 can contain multiple subsets when there are multiple hosts involved in the instrumented TensorFlow job (e.g., TPUs and Parameter Servers). Currently, there is no bit in those subsets of files that indicates they belong to the same instrumented TF job. - This CL addresses this problem by adding a field to the metadata proto used by those files (`tfdbg_run_id`). - The DebugEventsWriter code is revised, so that this new field is written to the metadata file of the file set on the writer's construction. - Also in this CL: remove the previous 1-arg `GetDebugEventsWriter(dump_root)` that creates the writer object if it doesn't exist at the specified dump_root. Replace it with `LookUpDebugEventsWriter(dump_root)` that only looks up the writer object and returns a non-OK status if such an object hasn't been created at `dump_root`. This makes the code less error-prone by keeping only the fully-explicit, 3-arg `GetDebugEventsWriter()`. 
PiperOrigin-RevId: 316537044 Change-Id: Id5be0b771fbf37c0fc796f1514ed858a0e6d38f0 --- tensorflow/core/kernels/debug_ops.h | 11 +- tensorflow/core/ops/debug_ops.cc | 1 + tensorflow/core/protobuf/debug_event.proto | 6 + tensorflow/core/util/debug_events_writer.cc | 23 +++- tensorflow/core/util/debug_events_writer.h | 33 ++++-- .../core/util/debug_events_writer_test.cc | 104 ++++++++++-------- .../client/debug_events_writer_wrapper.cc | 59 +++++----- .../python/debug/lib/debug_events_reader.py | 5 + .../python/debug/lib/debug_events_writer.py | 6 +- .../debug/lib/debug_events_writer_test.py | 39 +++++-- .../python/debug/lib/debug_v2_ops_test.py | 7 +- .../python/debug/lib/dumping_callback.py | 15 ++- .../debug/lib/dumping_callback_test_lib.py | 2 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 15 files changed, 203 insertions(+), 112 deletions(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 3fef822244d..498cd6146a8 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -410,7 +410,8 @@ class DebugIdentityV2Op : public OpKernel { : OpKernel(context), device_name_(context->device()->name()), output_slot_(-1), - tensor_debug_mode_(0) { + tensor_debug_mode_(0), + tfdbg_run_id_() { std::vector debug_urls; OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls)); for (const string& debug_url : debug_urls) { @@ -435,14 +436,17 @@ class DebugIdentityV2Op : public OpKernel { circular_buffer_size_ = tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize; } + if (context->HasAttr("tfdbg_run_id")) { + OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_)); + } } void Compute(OpKernelContext* context) override { const Tensor& tensor = context->input(0); for (const string& dump_root : dump_roots_) { tfdbg::DebugEventsWriter* debug_events_writer = - tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root, - 
circular_buffer_size_); + tfdbg::DebugEventsWriter::GetDebugEventsWriter( + dump_root, tfdbg_run_id_, circular_buffer_size_); OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( tfdbg_context_id_, device_name_, op_name_, output_slot_, tensor_debug_mode_, tensor)); @@ -458,6 +462,7 @@ class DebugIdentityV2Op : public OpKernel { int32 output_slot_; int32 tensor_debug_mode_; int64 circular_buffer_size_; + string tfdbg_run_id_; }; typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc index 0ecc58a6a8f..ac67a0f75f3 100644 --- a/tensorflow/core/ops/debug_ops.cc +++ b/tensorflow/core/ops/debug_ops.cc @@ -91,6 +91,7 @@ REGISTER_OP("DebugIdentityV2") .Attr("tensor_debug_mode: int = -1") .Attr("debug_urls: list(string) = []") .Attr("circular_buffer_size: int = 1000") + .Attr("tfdbg_run_id: string = ''") .SetIsStateful() .SetShapeFn(shape_inference::UnchangedShape); diff --git a/tensorflow/core/protobuf/debug_event.proto b/tensorflow/core/protobuf/debug_event.proto index 005abe53194..5541c397fb8 100644 --- a/tensorflow/core/protobuf/debug_event.proto +++ b/tensorflow/core/protobuf/debug_event.proto @@ -115,6 +115,12 @@ message DebugMetadata { // Version of the DebugEvent file format. // Has a format of "debug.Event:", e.g., "debug.Event:1". string file_version = 2; + + // A unique ID for the current run of tfdbg. + // A run of tfdbg is defined as a TensorFlow job instrumented by tfdbg. + // Multiple hosts in a distributed TensorFlow job instrumented by tfdbg + // have the same ID. 
+ string tfdbg_run_id = 3; } // Content of a source file involved in the execution of the debugged TensorFlow diff --git a/tensorflow/core/util/debug_events_writer.cc b/tensorflow/core/util/debug_events_writer.cc index d9c3393ce3c..8ee42959131 100644 --- a/tensorflow/core/util/debug_events_writer.cc +++ b/tensorflow/core/util/debug_events_writer.cc @@ -122,23 +122,31 @@ DebugEventsWriter::~DebugEventsWriter() { Close().IgnoreError(); } // static DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( - const string& dump_root, int64 circular_buffer_size) { + const string& dump_root, const string& tfdbg_run_id, + int64 circular_buffer_size) { mutex_lock l(DebugEventsWriter::factory_mu_); std::unordered_map>* writer_pool = DebugEventsWriter::GetDebugEventsWriterMap(); if (writer_pool->find(dump_root) == writer_pool->end()) { std::unique_ptr writer( - new DebugEventsWriter(dump_root, circular_buffer_size)); + new DebugEventsWriter(dump_root, tfdbg_run_id, circular_buffer_size)); writer_pool->insert(std::make_pair(dump_root, std::move(writer))); } return (*writer_pool)[dump_root].get(); } // static -DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( - const string& dump_root) { - return DebugEventsWriter::GetDebugEventsWriter(dump_root, - kDefaultCyclicBufferSize); +Status DebugEventsWriter::LookUpDebugEventsWriter( + const string& dump_root, DebugEventsWriter** debug_events_writer) { + mutex_lock l(DebugEventsWriter::factory_mu_); + std::unordered_map>* writer_pool = + DebugEventsWriter::GetDebugEventsWriterMap(); + if (writer_pool->find(dump_root) == writer_pool->end()) { + return errors::FailedPrecondition( + "No DebugEventsWriter has been created at dump root ", dump_root); + } + *debug_events_writer = (*writer_pool)[dump_root].get(); + return Status::OK(); } Status DebugEventsWriter::Init() { @@ -179,6 +187,7 @@ Status DebugEventsWriter::Init() { metadata->set_tensorflow_version(TF_VERSION_STRING); metadata->set_file_version( strings::Printf("%s%d", 
kVersionPrefix, kCurrentFormatVersion)); + metadata->set_tfdbg_run_id(tfdbg_run_id_); TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA)); TF_RETURN_WITH_CONTEXT_IF_ERROR( metadata_writer_->Flush(), "Failed to flush debug event metadata writer"); @@ -457,9 +466,11 @@ DebugEventsWriter::GetDebugEventsWriterMap() { } DebugEventsWriter::DebugEventsWriter(const string& dump_root, + const string& tfdbg_run_id, int64 circular_buffer_size) : env_(Env::Default()), dump_root_(dump_root), + tfdbg_run_id_(tfdbg_run_id), is_initialized_(false), initialization_mu_(), circular_buffer_size_(circular_buffer_size), diff --git a/tensorflow/core/util/debug_events_writer.h b/tensorflow/core/util/debug_events_writer.h index 39835adf1a6..412f947e22d 100644 --- a/tensorflow/core/util/debug_events_writer.h +++ b/tensorflow/core/util/debug_events_writer.h @@ -93,18 +93,27 @@ class DebugEventsWriter { // sets of six. The singleton pattern avoids storing multiple sets in a single // folder, which might cause confusion. // + // If an instance of DebugEventsWriter has already been created at a + // `dump_root`, calling this method with the same `dump_root` will return + // the existing instance. + // // Args: // dump_root: Dump root directory. If it doesn't exist, will be created. + // tfdbg_run_id: Debugging run ID of the writer. // circular_buffer_size: Circular buffer size (in number of DebugEvent // protos). If set to a value <=0, will abolish the circular-buffer // behavior. // Returns: // A pointer to a DebugEventsWriter object: a per-dump_root singleton. static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root, + const string& tfdbg_run_id, int64 circular_buffer_size); - // Same as the 2-arg factory method above, but uses the default circular - // buffer size. - static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root); + // Look up existing events writer by dump_root. 
+ // If no DebugEventsWriter has been created at the dump_root, a non-OK + // Status will be returned. Else an OK status will be returned, with + // the pointer to the existing instance provided by reference. + static Status LookUpDebugEventsWriter( + const string& dump_root, DebugEventsWriter** debug_events_writer); ~DebugEventsWriter(); // Sets the debug event filenames and opens file for writing. @@ -116,8 +125,8 @@ class DebugEventsWriter { // deleted by another process), this will open a new file. Status Init(); - // The four DebugEvent fields below are written _without_ the circular buffer. - // Source file contents are written to the *.source_files file. + // The four DebugEvent fields below are written _without_ the circular + // buffer. Source file contents are written to the *.source_files file. // Takes ownership of source_file. Status WriteSourceFile(SourceFile* source_file); // Stack frames are written to the *.code_locations file. @@ -132,9 +141,8 @@ class DebugEventsWriter { // The two DebugEvent fields below are written to the circular buffer // and saved to disk only at the FlushExecutionFiles() call. - // Execution events (eager execution of an op or a tf.function) are written to - // the *.execution file. - // Takes ownership of execution. + // Execution events (eager execution of an op or a tf.function) are written + // to the *.execution file. Takes ownership of execution. Status WriteExecution(Execution* execution); // Graph execution traces (graph-internal tensor values or their summaries) // are written to the *.graph_execution_traces file. @@ -151,8 +159,9 @@ class DebugEventsWriter { // which the trace concerns multiple tensors, this is an empty string. // output_slot: Output slot index of the op that this trace is concerned // with. - // tensor_debug_mode: An integer that represents the tensor-debug mode enum. 
- // tensor_value: The value of the tensor that describes the tensor(s) + // tensor_debug_mode: An integer that represents the tensor-debug mode + // enum. + // tensor_value: The value of the tensor that describes the tensor(s) // that this trace is concerned with. The semantics of this tensor value // depends on the value of `tensor_debug_mode`. Status WriteGraphExecutionTrace(const string& tfdbg_context_id, @@ -208,7 +217,8 @@ class DebugEventsWriter { // Guards calls to the GetDebugEventsWriter() method. static mutex factory_mu_; - DebugEventsWriter(const string& dump_root, int64 circular_buffer_size); + DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id, + int64 circular_buffer_size); // Get the path prefix. The same for all files, which differ only in the // suffix. @@ -227,6 +237,7 @@ class DebugEventsWriter { Env* env_; const string dump_root_; + const string tfdbg_run_id_; string file_prefix_; bool is_initialized_ TF_GUARDED_BY(initialization_mu_); diff --git a/tensorflow/core/util/debug_events_writer_test.cc b/tensorflow/core/util/debug_events_writer_test.cc index bd0c731bc90..45895763673 100644 --- a/tensorflow/core/util/debug_events_writer_test.cc +++ b/tensorflow/core/util/debug_events_writer_test.cc @@ -71,6 +71,7 @@ class DebugEventsWriterTest : public ::testing::Test { dump_root_ = io::JoinPath( testing::TmpDir(), strings::Printf("%010lld", static_cast(env()->NowMicros()))); + tfdbg_run_id_ = "test_tfdbg_run_id"; } void TearDown() override { @@ -85,14 +86,15 @@ class DebugEventsWriterTest : public ::testing::Test { } string dump_root_; + string tfdbg_run_id_; }; TEST_F(DebugEventsWriterTest, GetDebugEventsWriterSameRootGivesSameObject) { // Test the per-dump_root_ singleton pattern. 
- DebugEventsWriter* writer_1 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); - DebugEventsWriter* writer_2 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); + DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); EXPECT_EQ(writer_1, writer_2); } @@ -103,8 +105,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterSameDumpRoot) { std::vector writers; mutex mu; auto fn = [this, &writers, &mu]() { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); { mutex_lock l(mu); writers.push_back(writer); @@ -131,8 +133,9 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) { auto fn = [this, &counter, &writers, &mu]() { const string new_dump_root = io::JoinPath(dump_root_, strings::Printf("%ld", counter.fetch_add(1))); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(new_dump_root); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + new_dump_root, tfdbg_run_id_, + DebugEventsWriter::kDefaultCyclicBufferSize); { mutex_lock l(mu); writers.push_back(writer); @@ -151,17 +154,17 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) { TEST_F(DebugEventsWriterTest, GetDebugEventsWriterDifferentRoots) { // Test the DebugEventsWriters for different directories are different. 
- DebugEventsWriter* writer_1 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); const string dump_root_2 = io::JoinPath(dump_root_, "subdirectory"); - DebugEventsWriter* writer_2 = - DebugEventsWriter::GetDebugEventsWriter(dump_root_2); + DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter( + dump_root_2, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); EXPECT_NE(writer_1, writer_2); } TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Close()); @@ -174,6 +177,8 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { const string file_version = actuals[0].debug_metadata().file_version(); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); + // Check the tfdbg run ID. + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); // Verify that the .source_files file has been created and is empty. 
ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); @@ -182,22 +187,22 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { } TEST_F(DebugEventsWriterTest, CallingCloseWithoutInitIsOkay) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Close()); } TEST_F(DebugEventsWriterTest, CallingCloseTwiceIsOkay) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close()); } TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { // Test that concurrent calls to Init() works correctly. - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); thread::ThreadPool* thread_pool = new thread::ThreadPool(Env::Default(), "test_pool", 4); @@ -218,6 +223,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { const string file_version = actuals[0].debug_metadata().file_version(); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); // Verify that the .source_files file has been created and is empty. ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); @@ -227,14 +233,15 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) { // Test that Init() is idempotent. 
- DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); std::vector actuals; ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); EXPECT_EQ(actuals.size(), 1); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); string metadata_path_1 = @@ -248,12 +255,13 @@ TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) { ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); EXPECT_EQ(actuals.size(), 1); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); + EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id"); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); } TEST_F(DebugEventsWriterTest, WriteSourceFile) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); SourceFile* source_file_1 = new SourceFile(); @@ -313,8 +321,8 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { } TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); StackFrameWithId* stack_frame_1 = new StackFrameWithId(); @@ -375,8 +383,8 @@ TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { } TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { - 
DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); GraphOpCreation* graph_op_creation = new GraphOpCreation(); @@ -415,8 +423,8 @@ TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { const size_t kConcurrentWrites = 100; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -456,8 +464,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { const size_t kConcurrentWrites = 100; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -498,8 +506,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { const int32 kConcurrentWrites = 30; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); thread::ThreadPool* thread_pool = @@ -576,8 +584,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { // Verify that no writing to 
disk happens until the flushing method is called. const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -601,8 +609,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { // Verify that writing to disk happens when the flushing method is called. const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -673,8 +681,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { // Check no writing to disk happens before the flushing method is called. 
const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -697,8 +705,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { const size_t kCyclicBufferSize = -1; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); // NOTE(cais): `writer->Init()` is not called here before // WriteGraphExecutionTrace() is called. This test checks that this is okay // and the `GraphExecutionTrace` gets written correctly even without `Init()` @@ -722,8 +730,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { const size_t kCyclicBufferSize = 10; - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // First, try writing and flushing more debug events than the capacity @@ -788,8 +796,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { } TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize); TF_ASSERT_OK(writer->Init()); // Register and get 
some device IDs in a concurrent fashion. @@ -833,8 +841,8 @@ TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) { TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { const size_t kCyclicBufferSize = 0; // A value <= 0 disables cyclic behavior. - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( + dump_root_, tfdbg_run_id_, kCyclicBufferSize); TF_ASSERT_OK(writer->Init()); const size_t kNumEvents = 20; diff --git a/tensorflow/python/client/debug_events_writer_wrapper.cc b/tensorflow/python/client/debug_events_writer_wrapper.cc index a786c6f2db6..15802df40fe 100644 --- a/tensorflow/python/client/debug_events_writer_wrapper.cc +++ b/tensorflow/python/client/debug_events_writer_wrapper.cc @@ -29,9 +29,10 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { using namespace tensorflow::tfdbg; // NOLINT(build/namespaces) m.def("Init", - [](const std::string& dump_root, const int64 circular_buffer_size) { + [](const std::string& dump_root, const std::string& tfdbg_run_id, + const int64 circular_buffer_size) { DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( - dump_root, circular_buffer_size); + dump_root, tfdbg_run_id, circular_buffer_size); if (!writer->Init().ok()) { throw py::value_error(tensorflow::strings::Printf( "Failed to initialize debug events writer at: %s", @@ -41,8 +42,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteSourceFile", [](const std::string& dump_root, const py::object obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::SOURCE_FILES); @@ -50,8 +52,9 @@ 
PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteStackFrameWithId", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::STACK_FRAMES); @@ -59,8 +62,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteGraphOpCreation", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPHS); @@ -68,8 +72,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteDebuggedGraph", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedNonExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPHS); @@ -77,8 +82,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteExecution", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); 
writer->WriteSerializedExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::EXECUTION); @@ -86,31 +92,32 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) { m.def("WriteGraphExecutionTrace", [](const std::string& dump_root, const py::object& obj) { CheckProtoType(obj, "tensorflow.DebugEvent"); - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK( + DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->WriteSerializedExecutionDebugEvent( obj.attr("SerializeToString")().cast(), tfdbg::DebugEventFileType::GRAPH_EXECUTION_TRACES); }); - m.def("RegisterDeviceAndGetId", - [](const std::string& dump_root, const std::string& device_name) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); - return writer->RegisterDeviceAndGetId(device_name); - }); + m.def("RegisterDeviceAndGetId", [](const std::string& dump_root, + const std::string& device_name) { + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); + return writer->RegisterDeviceAndGetId(device_name); + }); m.def("FlushNonExecutionFiles", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->FlushNonExecutionFiles(); }); m.def("FlushExecutionFiles", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->FlushExecutionFiles(); }); m.def("Close", [](const std::string& dump_root) { - DebugEventsWriter* writer = - DebugEventsWriter::GetDebugEventsWriter(dump_root); + DebugEventsWriter* writer = nullptr; + 
TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer)); writer->Close(); }); }; diff --git a/tensorflow/python/debug/lib/debug_events_reader.py b/tensorflow/python/debug/lib/debug_events_reader.py index 4adb97de25b..743cea7103a 100644 --- a/tensorflow/python/debug/lib/debug_events_reader.py +++ b/tensorflow/python/debug/lib/debug_events_reader.py @@ -863,6 +863,7 @@ class DebugDataReader(object): debug_event = next(metadata_iter).debug_event self._starting_wall_time = debug_event.wall_time self._tensorflow_version = debug_event.debug_metadata.tensorflow_version + self._tfdbg_run_id = debug_event.debug_metadata.tfdbg_run_id def _load_source_files(self): """Incrementally read the .source_files DebugEvent file.""" @@ -1071,6 +1072,10 @@ class DebugDataReader(object): """ return self._tensorflow_version + def tfdbg_run_id(self): + """Get the debugger run ID of the debugged TensorFlow program.""" + return self._tfdbg_run_id + def outermost_graphs(self): """Get the number of outer most graphs read so far.""" return [graph for graph in self._graph_by_id.values() diff --git a/tensorflow/python/debug/lib/debug_events_writer.py b/tensorflow/python/debug/lib/debug_events_writer.py index 3de0ab78b8a..f223abdd099 100644 --- a/tensorflow/python/debug/lib/debug_events_writer.py +++ b/tensorflow/python/debug/lib/debug_events_writer.py @@ -32,6 +32,7 @@ class DebugEventsWriter(object): def __init__(self, dump_root, + tfdbg_run_id, circular_buffer_size=DEFAULT_CIRCULAR_BUFFER_SIZE): """Construct a DebugEventsWriter object. @@ -43,6 +44,7 @@ class DebugEventsWriter(object): Args: dump_root: The root directory for dumping debug data. If `dump_root` does not exist as a directory, it will be created. + tfdbg_run_id: Debugger Run ID. 
circular_buffer_size: Size of the circular buffer for each of the two execution-related debug events files: with the following suffixes: - .execution - .graph_execution_traces If <= 0, the circular-buffer @@ -51,7 +53,9 @@ class DebugEventsWriter(object): if not dump_root: raise ValueError("Empty or None dump root") self._dump_root = dump_root - _pywrap_debug_events_writer.Init(self._dump_root, circular_buffer_size) + self._tfdbg_run_id = tfdbg_run_id + _pywrap_debug_events_writer.Init(self._dump_root, self._tfdbg_run_id, + circular_buffer_size) def WriteSourceFile(self, source_file): """Write a SourceFile proto with the writer. diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py index 57721c1450f..7b06bf772be 100644 --- a/tensorflow/python/debug/lib/debug_events_writer_test.py +++ b/tensorflow/python/debug/lib/debug_events_writer_test.py @@ -41,7 +41,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testMultiThreadedConstructorCallWorks(self): def init_writer(): - debug_events_writer.DebugEventsWriter(self.dump_root) + debug_events_writer.DebugEventsWriter(self.dump_root, self.tfdbg_run_id) num_threads = 4 threads = [] @@ -66,7 +66,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self._readAndCheckMetadataFile() def testWriteSourceFilesAndStackFrames(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_protos = 10 for i in range(num_protos): source_file = debug_event_pb2.SourceFile() @@ -99,7 +100,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertEqual(actuals[i].file_line_col.file_index, i * 10) def testWriteGraphOpCreationAndDebuggedGraphs(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = 
debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_op_creations = 10 for i in range(num_op_creations): graph_op_creation = debug_event_pb2.GraphOpCreation() @@ -122,7 +124,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, "deadbeaf") def testConcurrentWritesToNonExecutionFilesWorks(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) source_file_state = {"counter": 0, "lock": threading.Lock()} @@ -201,15 +204,18 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteAndReadMetadata(self): t0 = time.time() - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) writer.Close() with debug_events_reader.DebugDataReader(self.dump_root) as reader: self.assertIsInstance(reader.starting_wall_time(), float) self.assertGreaterEqual(reader.starting_wall_time(), t0) self.assertEqual(reader.tensorflow_version(), versions.__version__) + self.assertTrue(reader.tfdbg_run_id()) def testWriteExecutionEventsWithCircularBuffer(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): execution = debug_event_pb2.Execution() @@ -232,7 +238,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteExecutionEventsWithoutCircularBufferBehavior(self): # A circular buffer size of 0 abolishes the circular buffer behavior. 
- writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, 0) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): execution = debug_event_pb2.Execution() @@ -248,7 +255,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertEqual(execution.op_type, "OpType%d" % i) def testWriteGraphExecutionTraceEventsWithCircularBuffer(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): trace = debug_event_pb2.GraphExecutionTrace() @@ -272,7 +280,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testWriteGraphExecutionTraceEventsWithoutCircularBufferBehavior(self): # A circular buffer size of 0 abolishes the circular buffer behavior. 
- writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, 0) num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 for i in range(num_execution_events): trace = debug_event_pb2.GraphExecutionTrace() @@ -290,6 +299,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentWritesToExecutionFiles(self): circular_buffer_size = 5 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ -345,7 +355,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, self.assertLen(op_names, len(set(op_names))) def testConcurrentSourceFileRandomReads(self): - writer = debug_events_writer.DebugEventsWriter(self.dump_root) + writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id) for i in range(100): source_file = debug_event_pb2.SourceFile( @@ -376,6 +387,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentExecutionUpdateAndRandomRead(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) writer_state = {"counter": 0, "done": False} @@ -410,6 +422,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentExecutionRandomReads(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) for i in range(100): @@ -445,6 +458,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, 
circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ -487,6 +501,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testConcurrentGraphExecutionTraceRandomReads(self): circular_buffer_size = -1 writer = debug_events_writer.DebugEventsWriter(self.dump_root, + self.tfdbg_run_id, circular_buffer_size) debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1") @@ -534,7 +549,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testRangeReadingExecutions(self, begin, end, expected_begin, expected_end): writer = debug_events_writer.DebugEventsWriter( - self.dump_root, circular_buffer_size=-1) + self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1) for i in range(5): execution = debug_event_pb2.Execution(op_type="OpType%d" % i) writer.WriteExecution(execution) @@ -559,7 +574,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase, def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin, expected_end): writer = debug_events_writer.DebugEventsWriter( - self.dump_root, circular_buffer_size=-1) + self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1) debugged_graph = debug_event_pb2.DebuggedGraph( graph_id="graph1", graph_name="graph1") writer.WriteDebuggedGraph(debugged_graph) diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index 10de01f4f2e..d715869f359 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -52,8 +52,9 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): super(DebugIdentityV2OpTest, self).setUp() # Testing using a small circular-buffer size. 
self.circular_buffer_size = 4 + self.tfdbg_run_id = "test_tfdbg_run" self.writer = debug_events_writer.DebugEventsWriter( - self.dump_root, self.circular_buffer_size) + self.dump_root, self.tfdbg_run_id, self.circular_buffer_size) def tearDown(self): self.writer.Close() @@ -192,7 +193,8 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): def testTwoDumpRoots(self): another_dump_root = os.path.join(self.dump_root, "another") another_debug_url = "file://%s" % another_dump_root - another_writer = debug_events_writer.DebugEventsWriter(another_dump_root) + another_writer = debug_events_writer.DebugEventsWriter( + another_dump_root, "test_tfdbg_run") @def_function.function def write_debug_trace(x): @@ -264,6 +266,7 @@ class DebugIdentityV2OpUninitializedWriterTest( self.assertAllClose( write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0]) writer = debug_events_writer.DebugEventsWriter(self.dump_root, + "test_tfdbg_run", circular_buffer_size) writer.FlushNonExecutionFiles() writer.FlushExecutionFiles() diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py index 0f5836e0644..563b52f8f63 100644 --- a/tensorflow/python/debug/lib/dumping_callback.py +++ b/tensorflow/python/debug/lib/dumping_callback.py @@ -69,6 +69,10 @@ def _debug_identity_v2_grad(op, dy): return dy +def _get_tfdbg_run_id(): + return str(uuid.uuid4())[:8] + + def _get_id(): """Get a short unique ID.""" return str(uuid.uuid4()) @@ -88,6 +92,7 @@ class _DumpingCallback(object): op_regex, tensor_dtypes): self._dump_root = dump_root + self._tfdbg_run_id = _get_tfdbg_run_id() self._tensor_debug_mode = tensor_debug_mode self._circular_buffer_size = circular_buffer_size self._op_regex = op_regex @@ -148,6 +153,10 @@ class _DumpingCallback(object): self._dump_root = dump_root self._writer = None + @property + def tfdbg_run_id(self): + return self._tfdbg_run_id + @property def tensor_debug_mode(self): return 
self._tensor_debug_mode @@ -161,6 +170,7 @@ class _DumpingCallback(object): if not self._writer: self._writer = debug_events_writer.DebugEventsWriter( self._dump_root, + self._tfdbg_run_id, circular_buffer_size=self._circular_buffer_size) return self._writer @@ -365,6 +375,8 @@ class _DumpingCallback(object): if tf_compat.forward_compatible(2020, 6, 24): debug_identity_op_kwargs[ "circular_buffer_size"] = self._circular_buffer_size + if tf_compat.forward_compatible(2020, 7, 1): + debug_identity_op_kwargs["tfdbg_run_id"] = self._tfdbg_run_id if tensor_debug_mode == debug_event_pb2.TensorDebugMode.NO_TENSOR: if (not self._should_dump_tensor(op_type, tensor.dtype) or not tensor.dtype.is_numpy_compatible): @@ -873,7 +885,8 @@ def disable_dump_debug_info(): """ if hasattr(_state, "dumping_callback"): dump_root = _state.dumping_callback.dump_root - debug_events_writer.DebugEventsWriter(dump_root).Close() + tfdbg_run_id = _state.dumping_callback.tfdbg_run_id + debug_events_writer.DebugEventsWriter(dump_root, tfdbg_run_id).Close() op_callbacks.remove_op_callback(_state.dumping_callback.callback) function_lib.remove_function_callback( _state.dumping_callback.function_callback) diff --git a/tensorflow/python/debug/lib/dumping_callback_test_lib.py b/tensorflow/python/debug/lib/dumping_callback_test_lib.py index 164644c57fa..05bf3aeb6da 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test_lib.py +++ b/tensorflow/python/debug/lib/dumping_callback_test_lib.py @@ -21,6 +21,7 @@ from __future__ import print_function import os import shutil import tempfile +import uuid from tensorflow.python.debug.lib import check_numerics_callback from tensorflow.python.debug.lib import debug_events_reader @@ -35,6 +36,7 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): def setUp(self): super(DumpingCallbackTestBase, self).setUp() self.dump_root = tempfile.mkdtemp() + self.tfdbg_run_id = str(uuid.uuid4()) def tearDown(self): if os.path.isdir(self.dump_root): diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 75ddf32cbe0..54d15b601c5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -982,7 +982,7 @@ tf_module { } member_method { name: "DebugIdentityV2" - argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " + argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], " } member_method { name: "DebugNanCount" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 75ddf32cbe0..54d15b601c5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -982,7 +982,7 @@ tf_module { } member_method { name: "DebugIdentityV2" - argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " + argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], " } member_method { name: "DebugNanCount"