[tfdbg2] Add tfdbg_run_id to metadata of data dumps

- A data dump file set generated by tfdbg2 can contain
  multiple subsets when there are multiple hosts involved
  in the instrumented TensorFlow job (e.g., TPUs and Parameter Servers).
  Currently, nothing in those subsets of files
  indicates that they belong to the same instrumented TF job.
  - This CL addresses this problem by adding a field (`tfdbg_run_id`)
    to the metadata proto used by those files.
- The DebugEventsWriter code is revised so that this new field is
  passed in at the writer's construction and written to the metadata
  file of the file set when the writer is initialized.
- Also in this CL: remove the previous 1-arg `GetDebugEventsWriter(dump_root)`
  that creates the writer object if it doesn't exist at the specified
  dump_root. Replace it with `LookUpDebugEventsWriter(dump_root)` that only
  looks up the writer object and returns a non-OK status if such an object
  hasn't been created at `dump_root`. This makes the code less error prone by
  keeping only the fully-explicit, 3-arg `GetDebugEventsWriter()`.

PiperOrigin-RevId: 316537044
Change-Id: Id5be0b771fbf37c0fc796f1514ed858a0e6d38f0
This commit is contained in:
Shanqing Cai 2020-06-15 13:49:39 -07:00 committed by TensorFlower Gardener
parent 4381963d2d
commit a8950d70bf
15 changed files with 203 additions and 112 deletions

View File

@ -410,7 +410,8 @@ class DebugIdentityV2Op : public OpKernel {
: OpKernel(context), : OpKernel(context),
device_name_(context->device()->name()), device_name_(context->device()->name()),
output_slot_(-1), output_slot_(-1),
tensor_debug_mode_(0) { tensor_debug_mode_(0),
tfdbg_run_id_() {
std::vector<string> debug_urls; std::vector<string> debug_urls;
OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls)); OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
for (const string& debug_url : debug_urls) { for (const string& debug_url : debug_urls) {
@ -435,14 +436,17 @@ class DebugIdentityV2Op : public OpKernel {
circular_buffer_size_ = circular_buffer_size_ =
tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize; tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
} }
if (context->HasAttr("tfdbg_run_id")) {
OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
}
} }
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& tensor = context->input(0); const Tensor& tensor = context->input(0);
for (const string& dump_root : dump_roots_) { for (const string& dump_root : dump_roots_) {
tfdbg::DebugEventsWriter* debug_events_writer = tfdbg::DebugEventsWriter* debug_events_writer =
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root, tfdbg::DebugEventsWriter::GetDebugEventsWriter(
circular_buffer_size_); dump_root, tfdbg_run_id_, circular_buffer_size_);
OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
tfdbg_context_id_, device_name_, op_name_, tfdbg_context_id_, device_name_, op_name_,
output_slot_, tensor_debug_mode_, tensor)); output_slot_, tensor_debug_mode_, tensor));
@ -458,6 +462,7 @@ class DebugIdentityV2Op : public OpKernel {
int32 output_slot_; int32 output_slot_;
int32 tensor_debug_mode_; int32 tensor_debug_mode_;
int64 circular_buffer_size_; int64 circular_buffer_size_;
string tfdbg_run_id_;
}; };
typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::ThreadPoolDevice CPUDevice;

View File

@ -91,6 +91,7 @@ REGISTER_OP("DebugIdentityV2")
.Attr("tensor_debug_mode: int = -1") .Attr("tensor_debug_mode: int = -1")
.Attr("debug_urls: list(string) = []") .Attr("debug_urls: list(string) = []")
.Attr("circular_buffer_size: int = 1000") .Attr("circular_buffer_size: int = 1000")
.Attr("tfdbg_run_id: string = ''")
.SetIsStateful() .SetIsStateful()
.SetShapeFn(shape_inference::UnchangedShape); .SetShapeFn(shape_inference::UnchangedShape);

View File

@ -115,6 +115,12 @@ message DebugMetadata {
// Version of the DebugEvent file format. // Version of the DebugEvent file format.
// Has a format of "debug.Event:<number>", e.g., "debug.Event:1". // Has a format of "debug.Event:<number>", e.g., "debug.Event:1".
string file_version = 2; string file_version = 2;
// A unique ID for the current run of tfdbg.
// A run of tfdbg is defined as a TensorFlow job instrumented by tfdbg.
// Multiple hosts in a distributed TensorFlow job instrumented by tfdbg
// have the same ID.
string tfdbg_run_id = 3;
} }
// Content of a source file involved in the execution of the debugged TensorFlow // Content of a source file involved in the execution of the debugged TensorFlow

View File

@ -122,23 +122,31 @@ DebugEventsWriter::~DebugEventsWriter() { Close().IgnoreError(); }
// static // static
DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter(
const string& dump_root, int64 circular_buffer_size) { const string& dump_root, const string& tfdbg_run_id,
int64 circular_buffer_size) {
mutex_lock l(DebugEventsWriter::factory_mu_); mutex_lock l(DebugEventsWriter::factory_mu_);
std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool = std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
DebugEventsWriter::GetDebugEventsWriterMap(); DebugEventsWriter::GetDebugEventsWriterMap();
if (writer_pool->find(dump_root) == writer_pool->end()) { if (writer_pool->find(dump_root) == writer_pool->end()) {
std::unique_ptr<DebugEventsWriter> writer( std::unique_ptr<DebugEventsWriter> writer(
new DebugEventsWriter(dump_root, circular_buffer_size)); new DebugEventsWriter(dump_root, tfdbg_run_id, circular_buffer_size));
writer_pool->insert(std::make_pair(dump_root, std::move(writer))); writer_pool->insert(std::make_pair(dump_root, std::move(writer)));
} }
return (*writer_pool)[dump_root].get(); return (*writer_pool)[dump_root].get();
} }
// static // static
DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter( Status DebugEventsWriter::LookUpDebugEventsWriter(
const string& dump_root) { const string& dump_root, DebugEventsWriter** debug_events_writer) {
return DebugEventsWriter::GetDebugEventsWriter(dump_root, mutex_lock l(DebugEventsWriter::factory_mu_);
kDefaultCyclicBufferSize); std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
DebugEventsWriter::GetDebugEventsWriterMap();
if (writer_pool->find(dump_root) == writer_pool->end()) {
return errors::FailedPrecondition(
"No DebugEventsWriter has been created at dump root ", dump_root);
}
*debug_events_writer = (*writer_pool)[dump_root].get();
return Status::OK();
} }
Status DebugEventsWriter::Init() { Status DebugEventsWriter::Init() {
@ -179,6 +187,7 @@ Status DebugEventsWriter::Init() {
metadata->set_tensorflow_version(TF_VERSION_STRING); metadata->set_tensorflow_version(TF_VERSION_STRING);
metadata->set_file_version( metadata->set_file_version(
strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion)); strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion));
metadata->set_tfdbg_run_id(tfdbg_run_id_);
TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA)); TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA));
TF_RETURN_WITH_CONTEXT_IF_ERROR( TF_RETURN_WITH_CONTEXT_IF_ERROR(
metadata_writer_->Flush(), "Failed to flush debug event metadata writer"); metadata_writer_->Flush(), "Failed to flush debug event metadata writer");
@ -457,9 +466,11 @@ DebugEventsWriter::GetDebugEventsWriterMap() {
} }
DebugEventsWriter::DebugEventsWriter(const string& dump_root, DebugEventsWriter::DebugEventsWriter(const string& dump_root,
const string& tfdbg_run_id,
int64 circular_buffer_size) int64 circular_buffer_size)
: env_(Env::Default()), : env_(Env::Default()),
dump_root_(dump_root), dump_root_(dump_root),
tfdbg_run_id_(tfdbg_run_id),
is_initialized_(false), is_initialized_(false),
initialization_mu_(), initialization_mu_(),
circular_buffer_size_(circular_buffer_size), circular_buffer_size_(circular_buffer_size),

View File

@ -93,18 +93,27 @@ class DebugEventsWriter {
// sets of six. The singleton pattern avoids storing multiple sets in a single // sets of six. The singleton pattern avoids storing multiple sets in a single
// folder, which might cause confusion. // folder, which might cause confusion.
// //
// If an instance of DebugEventsWriter has already been created at a
// `dump_root`, calling this method with the same `dump_root` will return
// the existing instance.
//
// Args: // Args:
// dump_root: Dump root directory. If it doesn't exist, will be created. // dump_root: Dump root directory. If it doesn't exist, will be created.
// tfdbg_run_id: Debugging run ID of the writer.
// circular_buffer_size: Circular buffer size (in number of DebugEvent // circular_buffer_size: Circular buffer size (in number of DebugEvent
// protos). If set to a value <=0, will abolish the circular-buffer // protos). If set to a value <=0, will abolish the circular-buffer
// behavior. // behavior.
// Returns: // Returns:
// A pointer to a DebugEventsWriter object: a per-dump_root singleton. // A pointer to a DebugEventsWriter object: a per-dump_root singleton.
static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root, static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root,
const string& tfdbg_run_id,
int64 circular_buffer_size); int64 circular_buffer_size);
// Same as the 2-arg factory method above, but uses the default circular // Look up existing events writer by dump_root.
// buffer size. // If no DebugEventsWriter has been created at the dump_root, a non-OK
static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root); // Status will be returned. Else an OK status will be returned, with
// the pointer to the existing instance provided by reference.
static Status LookUpDebugEventsWriter(
const string& dump_root, DebugEventsWriter** debug_events_writer);
~DebugEventsWriter(); ~DebugEventsWriter();
// Sets the debug event filenames and opens file for writing. // Sets the debug event filenames and opens file for writing.
@ -116,8 +125,8 @@ class DebugEventsWriter {
// deleted by another process), this will open a new file. // deleted by another process), this will open a new file.
Status Init(); Status Init();
// The four DebugEvent fields below are written _without_ the circular buffer. // The four DebugEvent fields below are written _without_ the circular
// Source file contents are written to the *.source_files file. // buffer. Source file contents are written to the *.source_files file.
// Takes ownership of source_file. // Takes ownership of source_file.
Status WriteSourceFile(SourceFile* source_file); Status WriteSourceFile(SourceFile* source_file);
// Stack frames are written to the *.code_locations file. // Stack frames are written to the *.code_locations file.
@ -132,9 +141,8 @@ class DebugEventsWriter {
// The two DebugEvent fields below are written to the circular buffer // The two DebugEvent fields below are written to the circular buffer
// and saved to disk only at the FlushExecutionFiles() call. // and saved to disk only at the FlushExecutionFiles() call.
// Execution events (eager execution of an op or a tf.function) are written to // Execution events (eager execution of an op or a tf.function) are written
// the *.execution file. // to the *.execution file. Takes ownership of execution.
// Takes ownership of execution.
Status WriteExecution(Execution* execution); Status WriteExecution(Execution* execution);
// Graph execution traces (graph-internal tensor values or their summaries) // Graph execution traces (graph-internal tensor values or their summaries)
// are written to the *.graph_execution_traces file. // are written to the *.graph_execution_traces file.
@ -151,8 +159,9 @@ class DebugEventsWriter {
// which the trace concerns multiple tensors, this is an empty string. // which the trace concerns multiple tensors, this is an empty string.
// output_slot: Output slot index of the op that this trace is concerned // output_slot: Output slot index of the op that this trace is concerned
// with. // with.
// tensor_debug_mode: An integer that represents the tensor-debug mode enum. // tensor_debug_mode: An integer that represents the tensor-debug mode
// tensor_value: The value of the tensor that describes the tensor(s) // enum. tensor_value: The value of the tensor that describes the
// tensor(s)
// that this trace is concerned with. The semantics of this tensor value // that this trace is concerned with. The semantics of this tensor value
// depends on the value of `tensor_debug_mode`. // depends on the value of `tensor_debug_mode`.
Status WriteGraphExecutionTrace(const string& tfdbg_context_id, Status WriteGraphExecutionTrace(const string& tfdbg_context_id,
@ -208,7 +217,8 @@ class DebugEventsWriter {
// Guards calls to the GetDebugEventsWriter() method. // Guards calls to the GetDebugEventsWriter() method.
static mutex factory_mu_; static mutex factory_mu_;
DebugEventsWriter(const string& dump_root, int64 circular_buffer_size); DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id,
int64 circular_buffer_size);
// Get the path prefix. The same for all files, which differ only in the // Get the path prefix. The same for all files, which differ only in the
// suffix. // suffix.
@ -227,6 +237,7 @@ class DebugEventsWriter {
Env* env_; Env* env_;
const string dump_root_; const string dump_root_;
const string tfdbg_run_id_;
string file_prefix_; string file_prefix_;
bool is_initialized_ TF_GUARDED_BY(initialization_mu_); bool is_initialized_ TF_GUARDED_BY(initialization_mu_);

View File

@ -71,6 +71,7 @@ class DebugEventsWriterTest : public ::testing::Test {
dump_root_ = io::JoinPath( dump_root_ = io::JoinPath(
testing::TmpDir(), testing::TmpDir(),
strings::Printf("%010lld", static_cast<long long>(env()->NowMicros()))); strings::Printf("%010lld", static_cast<long long>(env()->NowMicros())));
tfdbg_run_id_ = "test_tfdbg_run_id";
} }
void TearDown() override { void TearDown() override {
@ -85,14 +86,15 @@ class DebugEventsWriterTest : public ::testing::Test {
} }
string dump_root_; string dump_root_;
string tfdbg_run_id_;
}; };
TEST_F(DebugEventsWriterTest, GetDebugEventsWriterSameRootGivesSameObject) { TEST_F(DebugEventsWriterTest, GetDebugEventsWriterSameRootGivesSameObject) {
// Test the per-dump_root_ singleton pattern. // Test the per-dump_root_ singleton pattern.
DebugEventsWriter* writer_1 = DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
DebugEventsWriter* writer_2 = DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
EXPECT_EQ(writer_1, writer_2); EXPECT_EQ(writer_1, writer_2);
} }
@ -103,8 +105,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterSameDumpRoot) {
std::vector<DebugEventsWriter*> writers; std::vector<DebugEventsWriter*> writers;
mutex mu; mutex mu;
auto fn = [this, &writers, &mu]() { auto fn = [this, &writers, &mu]() {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
{ {
mutex_lock l(mu); mutex_lock l(mu);
writers.push_back(writer); writers.push_back(writer);
@ -131,8 +133,9 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) {
auto fn = [this, &counter, &writers, &mu]() { auto fn = [this, &counter, &writers, &mu]() {
const string new_dump_root = const string new_dump_root =
io::JoinPath(dump_root_, strings::Printf("%ld", counter.fetch_add(1))); io::JoinPath(dump_root_, strings::Printf("%ld", counter.fetch_add(1)));
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(new_dump_root); new_dump_root, tfdbg_run_id_,
DebugEventsWriter::kDefaultCyclicBufferSize);
{ {
mutex_lock l(mu); mutex_lock l(mu);
writers.push_back(writer); writers.push_back(writer);
@ -151,17 +154,17 @@ TEST_F(DebugEventsWriterTest, ConcurrentGetDebugEventsWriterDiffDumpRoots) {
TEST_F(DebugEventsWriterTest, GetDebugEventsWriterDifferentRoots) { TEST_F(DebugEventsWriterTest, GetDebugEventsWriterDifferentRoots) {
// Test the DebugEventsWriters for different directories are different. // Test the DebugEventsWriters for different directories are different.
DebugEventsWriter* writer_1 = DebugEventsWriter* writer_1 = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
const string dump_root_2 = io::JoinPath(dump_root_, "subdirectory"); const string dump_root_2 = io::JoinPath(dump_root_, "subdirectory");
DebugEventsWriter* writer_2 = DebugEventsWriter* writer_2 = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_2); dump_root_2, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
EXPECT_NE(writer_1, writer_2); EXPECT_NE(writer_1, writer_2);
} }
TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) { TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close());
@ -174,6 +177,8 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) {
const string file_version = actuals[0].debug_metadata().file_version(); const string file_version = actuals[0].debug_metadata().file_version();
EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0);
EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix));
// Check the tfdbg run ID.
EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id");
// Verify that the .source_files file has been created and is empty. // Verify that the .source_files file has been created and is empty.
ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals);
@ -182,22 +187,22 @@ TEST_F(DebugEventsWriterTest, GetAndInitDebugEventsWriter) {
} }
TEST_F(DebugEventsWriterTest, CallingCloseWithoutInitIsOkay) { TEST_F(DebugEventsWriterTest, CallingCloseWithoutInitIsOkay) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close());
} }
TEST_F(DebugEventsWriterTest, CallingCloseTwiceIsOkay) { TEST_F(DebugEventsWriterTest, CallingCloseTwiceIsOkay) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close());
TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close());
} }
TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) { TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) {
// Test that concurrent calls to Init() works correctly. // Test that concurrent calls to Init() works correctly.
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
thread::ThreadPool* thread_pool = thread::ThreadPool* thread_pool =
new thread::ThreadPool(Env::Default(), "test_pool", 4); new thread::ThreadPool(Env::Default(), "test_pool", 4);
@ -218,6 +223,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) {
const string file_version = actuals[0].debug_metadata().file_version(); const string file_version = actuals[0].debug_metadata().file_version();
EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0); EXPECT_EQ(file_version.find(DebugEventsWriter::kVersionPrefix), 0);
EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix)); EXPECT_GT(file_version.size(), strlen(DebugEventsWriter::kVersionPrefix));
EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id");
// Verify that the .source_files file has been created and is empty. // Verify that the .source_files file has been created and is empty.
ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals); ReadDebugEventProtos(writer, DebugEventFileType::SOURCE_FILES, &actuals);
@ -227,14 +233,15 @@ TEST_F(DebugEventsWriterTest, ConcurrentInitCalls) {
TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) { TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) {
// Test that Init() is idempotent. // Test that Init() is idempotent.
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
std::vector<DebugEvent> actuals; std::vector<DebugEvent> actuals;
ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals);
EXPECT_EQ(actuals.size(), 1); EXPECT_EQ(actuals.size(), 1);
EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0);
EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id");
EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0);
string metadata_path_1 = string metadata_path_1 =
@ -248,12 +255,13 @@ TEST_F(DebugEventsWriterTest, InitTwiceDoesNotCreateNewMetadataFile) {
ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals); ReadDebugEventProtos(writer, DebugEventFileType::METADATA, &actuals);
EXPECT_EQ(actuals.size(), 1); EXPECT_EQ(actuals.size(), 1);
EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0); EXPECT_GT(actuals[0].debug_metadata().tensorflow_version().length(), 0);
EXPECT_EQ(actuals[0].debug_metadata().tfdbg_run_id(), "test_tfdbg_run_id");
EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0); EXPECT_GE(actuals[0].debug_metadata().file_version().size(), 0);
} }
TEST_F(DebugEventsWriterTest, WriteSourceFile) { TEST_F(DebugEventsWriterTest, WriteSourceFile) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
SourceFile* source_file_1 = new SourceFile(); SourceFile* source_file_1 = new SourceFile();
@ -313,8 +321,8 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) {
} }
TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { TEST_F(DebugEventsWriterTest, WriteStackFramesFile) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
StackFrameWithId* stack_frame_1 = new StackFrameWithId(); StackFrameWithId* stack_frame_1 = new StackFrameWithId();
@ -375,8 +383,8 @@ TEST_F(DebugEventsWriterTest, WriteStackFramesFile) {
} }
TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
GraphOpCreation* graph_op_creation = new GraphOpCreation(); GraphOpCreation* graph_op_creation = new GraphOpCreation();
@ -415,8 +423,8 @@ TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) {
TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) {
const size_t kConcurrentWrites = 100; const size_t kConcurrentWrites = 100;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
thread::ThreadPool* thread_pool = thread::ThreadPool* thread_pool =
@ -456,8 +464,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) {
TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) {
const size_t kConcurrentWrites = 100; const size_t kConcurrentWrites = 100;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
thread::ThreadPool* thread_pool = thread::ThreadPool* thread_pool =
@ -498,8 +506,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) {
TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) {
const int32 kConcurrentWrites = 30; const int32 kConcurrentWrites = 30;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
thread::ThreadPool* thread_pool = thread::ThreadPool* thread_pool =
@ -576,8 +584,8 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) {
TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) {
// Verify that no writing to disk happens until the flushing method is called. // Verify that no writing to disk happens until the flushing method is called.
const size_t kCyclicBufferSize = 10; const size_t kCyclicBufferSize = 10;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
// First, try writing and flushing more debug events than the capacity // First, try writing and flushing more debug events than the capacity
@ -601,8 +609,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) {
TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) {
// Verify that writing to disk happens when the flushing method is called. // Verify that writing to disk happens when the flushing method is called.
const size_t kCyclicBufferSize = 10; const size_t kCyclicBufferSize = 10;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
// First, try writing and flushing more debug events than the capacity // First, try writing and flushing more debug events than the capacity
@ -673,8 +681,8 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) {
TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) {
// Check no writing to disk happens before the flushing method is called. // Check no writing to disk happens before the flushing method is called.
const size_t kCyclicBufferSize = 10; const size_t kCyclicBufferSize = 10;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
// First, try writing and flushing more debug events than the capacity // First, try writing and flushing more debug events than the capacity
@ -697,8 +705,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) {
TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) {
const size_t kCyclicBufferSize = -1; const size_t kCyclicBufferSize = -1;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
// NOTE(cais): `writer->Init()` is not called here before // NOTE(cais): `writer->Init()` is not called here before
// WriteGraphExecutionTrace() is called. This test checks that this is okay // WriteGraphExecutionTrace() is called. This test checks that this is okay
// and the `GraphExecutionTrace` gets written correctly even without `Init()` // and the `GraphExecutionTrace` gets written correctly even without `Init()`
@ -722,8 +730,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) {
TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) {
const size_t kCyclicBufferSize = 10; const size_t kCyclicBufferSize = 10;
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
// First, try writing and flushing more debug events than the capacity // First, try writing and flushing more debug events than the capacity
@ -788,8 +796,8 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) {
} }
TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) { TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) {
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_); dump_root_, tfdbg_run_id_, DebugEventsWriter::kDefaultCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
// Register and get some device IDs in a concurrent fashion. // Register and get some device IDs in a concurrent fashion.
@ -833,8 +841,8 @@ TEST_F(DebugEventsWriterTest, RegisterDeviceAndGetIdTrace) {
TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) {
const size_t kCyclicBufferSize = 0; // A value <= 0 disables cyclic behavior. const size_t kCyclicBufferSize = 0; // A value <= 0 disables cyclic behavior.
DebugEventsWriter* writer = DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); dump_root_, tfdbg_run_id_, kCyclicBufferSize);
TF_ASSERT_OK(writer->Init()); TF_ASSERT_OK(writer->Init());
const size_t kNumEvents = 20; const size_t kNumEvents = 20;

View File

@ -29,9 +29,10 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
using namespace tensorflow::tfdbg; // NOLINT(build/namespaces) using namespace tensorflow::tfdbg; // NOLINT(build/namespaces)
m.def("Init", m.def("Init",
[](const std::string& dump_root, const int64 circular_buffer_size) { [](const std::string& dump_root, const std::string& tfdbg_run_id,
const int64 circular_buffer_size) {
DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter( DebugEventsWriter* writer = DebugEventsWriter::GetDebugEventsWriter(
dump_root, circular_buffer_size); dump_root, tfdbg_run_id, circular_buffer_size);
if (!writer->Init().ok()) { if (!writer->Init().ok()) {
throw py::value_error(tensorflow::strings::Printf( throw py::value_error(tensorflow::strings::Printf(
"Failed to initialize debug events writer at: %s", "Failed to initialize debug events writer at: %s",
@ -41,8 +42,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteSourceFile", m.def("WriteSourceFile",
[](const std::string& dump_root, const py::object obj) { [](const std::string& dump_root, const py::object obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedNonExecutionDebugEvent( writer->WriteSerializedNonExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::SOURCE_FILES); tfdbg::DebugEventFileType::SOURCE_FILES);
@ -50,8 +52,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteStackFrameWithId", m.def("WriteStackFrameWithId",
[](const std::string& dump_root, const py::object& obj) { [](const std::string& dump_root, const py::object& obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedNonExecutionDebugEvent( writer->WriteSerializedNonExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::STACK_FRAMES); tfdbg::DebugEventFileType::STACK_FRAMES);
@ -59,8 +62,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteGraphOpCreation", m.def("WriteGraphOpCreation",
[](const std::string& dump_root, const py::object& obj) { [](const std::string& dump_root, const py::object& obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedNonExecutionDebugEvent( writer->WriteSerializedNonExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::GRAPHS); tfdbg::DebugEventFileType::GRAPHS);
@ -68,8 +72,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteDebuggedGraph", m.def("WriteDebuggedGraph",
[](const std::string& dump_root, const py::object& obj) { [](const std::string& dump_root, const py::object& obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedNonExecutionDebugEvent( writer->WriteSerializedNonExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::GRAPHS); tfdbg::DebugEventFileType::GRAPHS);
@ -77,8 +82,9 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteExecution", m.def("WriteExecution",
[](const std::string& dump_root, const py::object& obj) { [](const std::string& dump_root, const py::object& obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedExecutionDebugEvent( writer->WriteSerializedExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::EXECUTION); tfdbg::DebugEventFileType::EXECUTION);
@ -86,31 +92,32 @@ PYBIND11_MODULE(_pywrap_debug_events_writer, m) {
m.def("WriteGraphExecutionTrace", m.def("WriteGraphExecutionTrace",
[](const std::string& dump_root, const py::object& obj) { [](const std::string& dump_root, const py::object& obj) {
CheckProtoType(obj, "tensorflow.DebugEvent"); CheckProtoType(obj, "tensorflow.DebugEvent");
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(
DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->WriteSerializedExecutionDebugEvent( writer->WriteSerializedExecutionDebugEvent(
obj.attr("SerializeToString")().cast<std::string>(), obj.attr("SerializeToString")().cast<std::string>(),
tfdbg::DebugEventFileType::GRAPH_EXECUTION_TRACES); tfdbg::DebugEventFileType::GRAPH_EXECUTION_TRACES);
}); });
m.def("RegisterDeviceAndGetId", m.def("RegisterDeviceAndGetId", [](const std::string& dump_root,
[](const std::string& dump_root, const std::string& device_name) { const std::string& device_name) {
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
return writer->RegisterDeviceAndGetId(device_name); return writer->RegisterDeviceAndGetId(device_name);
}); });
m.def("FlushNonExecutionFiles", [](const std::string& dump_root) { m.def("FlushNonExecutionFiles", [](const std::string& dump_root) {
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->FlushNonExecutionFiles(); writer->FlushNonExecutionFiles();
}); });
m.def("FlushExecutionFiles", [](const std::string& dump_root) { m.def("FlushExecutionFiles", [](const std::string& dump_root) {
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->FlushExecutionFiles(); writer->FlushExecutionFiles();
}); });
m.def("Close", [](const std::string& dump_root) { m.def("Close", [](const std::string& dump_root) {
DebugEventsWriter* writer = DebugEventsWriter* writer = nullptr;
DebugEventsWriter::GetDebugEventsWriter(dump_root); TF_CHECK_OK(DebugEventsWriter::LookUpDebugEventsWriter(dump_root, &writer));
writer->Close(); writer->Close();
}); });
}; };

View File

@ -863,6 +863,7 @@ class DebugDataReader(object):
debug_event = next(metadata_iter).debug_event debug_event = next(metadata_iter).debug_event
self._starting_wall_time = debug_event.wall_time self._starting_wall_time = debug_event.wall_time
self._tensorflow_version = debug_event.debug_metadata.tensorflow_version self._tensorflow_version = debug_event.debug_metadata.tensorflow_version
self._tfdbg_run_id = debug_event.debug_metadata.tfdbg_run_id
def _load_source_files(self): def _load_source_files(self):
"""Incrementally read the .source_files DebugEvent file.""" """Incrementally read the .source_files DebugEvent file."""
@ -1071,6 +1072,10 @@ class DebugDataReader(object):
""" """
return self._tensorflow_version return self._tensorflow_version
def tfdbg_run_id(self):
"""Get the debugger run ID of the debugged TensorFlow program."""
return self._tfdbg_run_id
def outermost_graphs(self): def outermost_graphs(self):
"""Get the number of outer most graphs read so far.""" """Get the number of outer most graphs read so far."""
return [graph for graph in self._graph_by_id.values() return [graph for graph in self._graph_by_id.values()

View File

@ -32,6 +32,7 @@ class DebugEventsWriter(object):
def __init__(self, def __init__(self,
dump_root, dump_root,
tfdbg_run_id,
circular_buffer_size=DEFAULT_CIRCULAR_BUFFER_SIZE): circular_buffer_size=DEFAULT_CIRCULAR_BUFFER_SIZE):
"""Construct a DebugEventsWriter object. """Construct a DebugEventsWriter object.
@ -43,6 +44,7 @@ class DebugEventsWriter(object):
Args: Args:
dump_root: The root directory for dumping debug data. If `dump_root` does dump_root: The root directory for dumping debug data. If `dump_root` does
not exist as a directory, it will be created. not exist as a directory, it will be created.
tfdbg_run_id: Debugger Run ID.
circular_buffer_size: Size of the circular buffer for each of the two circular_buffer_size: Size of the circular buffer for each of the two
execution-related debug events files: with the following suffixes: - execution-related debug events files: with the following suffixes: -
.execution - .graph_execution_traces If <= 0, the circular-buffer .execution - .graph_execution_traces If <= 0, the circular-buffer
@ -51,7 +53,9 @@ class DebugEventsWriter(object):
if not dump_root: if not dump_root:
raise ValueError("Empty or None dump root") raise ValueError("Empty or None dump root")
self._dump_root = dump_root self._dump_root = dump_root
_pywrap_debug_events_writer.Init(self._dump_root, circular_buffer_size) self._tfdbg_run_id = tfdbg_run_id
_pywrap_debug_events_writer.Init(self._dump_root, self._tfdbg_run_id,
circular_buffer_size)
def WriteSourceFile(self, source_file): def WriteSourceFile(self, source_file):
"""Write a SourceFile proto with the writer. """Write a SourceFile proto with the writer.

View File

@ -41,7 +41,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testMultiThreadedConstructorCallWorks(self): def testMultiThreadedConstructorCallWorks(self):
def init_writer(): def init_writer():
debug_events_writer.DebugEventsWriter(self.dump_root) debug_events_writer.DebugEventsWriter(self.dump_root, self.tfdbg_run_id)
num_threads = 4 num_threads = 4
threads = [] threads = []
@ -66,7 +66,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
self._readAndCheckMetadataFile() self._readAndCheckMetadataFile()
def testWriteSourceFilesAndStackFrames(self): def testWriteSourceFilesAndStackFrames(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
num_protos = 10 num_protos = 10
for i in range(num_protos): for i in range(num_protos):
source_file = debug_event_pb2.SourceFile() source_file = debug_event_pb2.SourceFile()
@ -99,7 +100,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
self.assertEqual(actuals[i].file_line_col.file_index, i * 10) self.assertEqual(actuals[i].file_line_col.file_index, i * 10)
def testWriteGraphOpCreationAndDebuggedGraphs(self): def testWriteGraphOpCreationAndDebuggedGraphs(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
num_op_creations = 10 num_op_creations = 10
for i in range(num_op_creations): for i in range(num_op_creations):
graph_op_creation = debug_event_pb2.GraphOpCreation() graph_op_creation = debug_event_pb2.GraphOpCreation()
@ -122,7 +124,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
"deadbeaf") "deadbeaf")
def testConcurrentWritesToNonExecutionFilesWorks(self): def testConcurrentWritesToNonExecutionFilesWorks(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
source_file_state = {"counter": 0, "lock": threading.Lock()} source_file_state = {"counter": 0, "lock": threading.Lock()}
@ -201,15 +204,18 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testWriteAndReadMetadata(self): def testWriteAndReadMetadata(self):
t0 = time.time() t0 = time.time()
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
writer.Close() writer.Close()
with debug_events_reader.DebugDataReader(self.dump_root) as reader: with debug_events_reader.DebugDataReader(self.dump_root) as reader:
self.assertIsInstance(reader.starting_wall_time(), float) self.assertIsInstance(reader.starting_wall_time(), float)
self.assertGreaterEqual(reader.starting_wall_time(), t0) self.assertGreaterEqual(reader.starting_wall_time(), t0)
self.assertEqual(reader.tensorflow_version(), versions.__version__) self.assertEqual(reader.tensorflow_version(), versions.__version__)
self.assertTrue(reader.tfdbg_run_id())
def testWriteExecutionEventsWithCircularBuffer(self): def testWriteExecutionEventsWithCircularBuffer(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
for i in range(num_execution_events): for i in range(num_execution_events):
execution = debug_event_pb2.Execution() execution = debug_event_pb2.Execution()
@ -232,7 +238,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testWriteExecutionEventsWithoutCircularBufferBehavior(self): def testWriteExecutionEventsWithoutCircularBufferBehavior(self):
# A circular buffer size of 0 abolishes the circular buffer behavior. # A circular buffer size of 0 abolishes the circular buffer behavior.
writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id, 0)
num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
for i in range(num_execution_events): for i in range(num_execution_events):
execution = debug_event_pb2.Execution() execution = debug_event_pb2.Execution()
@ -248,7 +255,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
self.assertEqual(execution.op_type, "OpType%d" % i) self.assertEqual(execution.op_type, "OpType%d" % i)
def testWriteGraphExecutionTraceEventsWithCircularBuffer(self): def testWriteGraphExecutionTraceEventsWithCircularBuffer(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
for i in range(num_execution_events): for i in range(num_execution_events):
trace = debug_event_pb2.GraphExecutionTrace() trace = debug_event_pb2.GraphExecutionTrace()
@ -272,7 +280,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testWriteGraphExecutionTraceEventsWithoutCircularBufferBehavior(self): def testWriteGraphExecutionTraceEventsWithoutCircularBufferBehavior(self):
# A circular buffer size of 0 abolishes the circular buffer behavior. # A circular buffer size of 0 abolishes the circular buffer behavior.
writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id, 0)
num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2 num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
for i in range(num_execution_events): for i in range(num_execution_events):
trace = debug_event_pb2.GraphExecutionTrace() trace = debug_event_pb2.GraphExecutionTrace()
@ -290,6 +299,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testConcurrentWritesToExecutionFiles(self): def testConcurrentWritesToExecutionFiles(self):
circular_buffer_size = 5 circular_buffer_size = 5
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id,
circular_buffer_size) circular_buffer_size)
debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
graph_name="graph1") graph_name="graph1")
@ -345,7 +355,8 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
self.assertLen(op_names, len(set(op_names))) self.assertLen(op_names, len(set(op_names)))
def testConcurrentSourceFileRandomReads(self): def testConcurrentSourceFileRandomReads(self):
writer = debug_events_writer.DebugEventsWriter(self.dump_root) writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id)
for i in range(100): for i in range(100):
source_file = debug_event_pb2.SourceFile( source_file = debug_event_pb2.SourceFile(
@ -376,6 +387,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testConcurrentExecutionUpdateAndRandomRead(self): def testConcurrentExecutionUpdateAndRandomRead(self):
circular_buffer_size = -1 circular_buffer_size = -1
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id,
circular_buffer_size) circular_buffer_size)
writer_state = {"counter": 0, "done": False} writer_state = {"counter": 0, "done": False}
@ -410,6 +422,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testConcurrentExecutionRandomReads(self): def testConcurrentExecutionRandomReads(self):
circular_buffer_size = -1 circular_buffer_size = -1
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id,
circular_buffer_size) circular_buffer_size)
for i in range(100): for i in range(100):
@ -445,6 +458,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self): def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self):
circular_buffer_size = -1 circular_buffer_size = -1
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id,
circular_buffer_size) circular_buffer_size)
debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
graph_name="graph1") graph_name="graph1")
@ -487,6 +501,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testConcurrentGraphExecutionTraceRandomReads(self): def testConcurrentGraphExecutionTraceRandomReads(self):
circular_buffer_size = -1 circular_buffer_size = -1
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
self.tfdbg_run_id,
circular_buffer_size) circular_buffer_size)
debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1", debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
graph_name="graph1") graph_name="graph1")
@ -534,7 +549,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testRangeReadingExecutions(self, begin, end, expected_begin, def testRangeReadingExecutions(self, begin, end, expected_begin,
expected_end): expected_end):
writer = debug_events_writer.DebugEventsWriter( writer = debug_events_writer.DebugEventsWriter(
self.dump_root, circular_buffer_size=-1) self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
for i in range(5): for i in range(5):
execution = debug_event_pb2.Execution(op_type="OpType%d" % i) execution = debug_event_pb2.Execution(op_type="OpType%d" % i)
writer.WriteExecution(execution) writer.WriteExecution(execution)
@ -559,7 +574,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase,
def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin, def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin,
expected_end): expected_end):
writer = debug_events_writer.DebugEventsWriter( writer = debug_events_writer.DebugEventsWriter(
self.dump_root, circular_buffer_size=-1) self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
debugged_graph = debug_event_pb2.DebuggedGraph( debugged_graph = debug_event_pb2.DebuggedGraph(
graph_id="graph1", graph_name="graph1") graph_id="graph1", graph_name="graph1")
writer.WriteDebuggedGraph(debugged_graph) writer.WriteDebuggedGraph(debugged_graph)

View File

@ -52,8 +52,9 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
super(DebugIdentityV2OpTest, self).setUp() super(DebugIdentityV2OpTest, self).setUp()
# Testing using a small circular-buffer size. # Testing using a small circular-buffer size.
self.circular_buffer_size = 4 self.circular_buffer_size = 4
self.tfdbg_run_id = "test_tfdbg_run"
self.writer = debug_events_writer.DebugEventsWriter( self.writer = debug_events_writer.DebugEventsWriter(
self.dump_root, self.circular_buffer_size) self.dump_root, self.tfdbg_run_id, self.circular_buffer_size)
def tearDown(self): def tearDown(self):
self.writer.Close() self.writer.Close()
@ -192,7 +193,8 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
def testTwoDumpRoots(self): def testTwoDumpRoots(self):
another_dump_root = os.path.join(self.dump_root, "another") another_dump_root = os.path.join(self.dump_root, "another")
another_debug_url = "file://%s" % another_dump_root another_debug_url = "file://%s" % another_dump_root
another_writer = debug_events_writer.DebugEventsWriter(another_dump_root) another_writer = debug_events_writer.DebugEventsWriter(
another_dump_root, "test_tfdbg_run")
@def_function.function @def_function.function
def write_debug_trace(x): def write_debug_trace(x):
@ -264,6 +266,7 @@ class DebugIdentityV2OpUninitializedWriterTest(
self.assertAllClose( self.assertAllClose(
write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0]) write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0])
writer = debug_events_writer.DebugEventsWriter(self.dump_root, writer = debug_events_writer.DebugEventsWriter(self.dump_root,
"test_tfdbg_run",
circular_buffer_size) circular_buffer_size)
writer.FlushNonExecutionFiles() writer.FlushNonExecutionFiles()
writer.FlushExecutionFiles() writer.FlushExecutionFiles()

View File

@ -69,6 +69,10 @@ def _debug_identity_v2_grad(op, dy):
return dy return dy
def _get_tfdbg_run_id():
return str(uuid.uuid4())[:8]
def _get_id(): def _get_id():
"""Get a short unique ID.""" """Get a short unique ID."""
return str(uuid.uuid4()) return str(uuid.uuid4())
@ -88,6 +92,7 @@ class _DumpingCallback(object):
op_regex, op_regex,
tensor_dtypes): tensor_dtypes):
self._dump_root = dump_root self._dump_root = dump_root
self._tfdbg_run_id = _get_tfdbg_run_id()
self._tensor_debug_mode = tensor_debug_mode self._tensor_debug_mode = tensor_debug_mode
self._circular_buffer_size = circular_buffer_size self._circular_buffer_size = circular_buffer_size
self._op_regex = op_regex self._op_regex = op_regex
@ -148,6 +153,10 @@ class _DumpingCallback(object):
self._dump_root = dump_root self._dump_root = dump_root
self._writer = None self._writer = None
@property
def tfdbg_run_id(self):
return self._tfdbg_run_id
@property @property
def tensor_debug_mode(self): def tensor_debug_mode(self):
return self._tensor_debug_mode return self._tensor_debug_mode
@ -161,6 +170,7 @@ class _DumpingCallback(object):
if not self._writer: if not self._writer:
self._writer = debug_events_writer.DebugEventsWriter( self._writer = debug_events_writer.DebugEventsWriter(
self._dump_root, self._dump_root,
self._tfdbg_run_id,
circular_buffer_size=self._circular_buffer_size) circular_buffer_size=self._circular_buffer_size)
return self._writer return self._writer
@ -365,6 +375,8 @@ class _DumpingCallback(object):
if tf_compat.forward_compatible(2020, 6, 24): if tf_compat.forward_compatible(2020, 6, 24):
debug_identity_op_kwargs[ debug_identity_op_kwargs[
"circular_buffer_size"] = self._circular_buffer_size "circular_buffer_size"] = self._circular_buffer_size
if tf_compat.forward_compatible(2020, 7, 1):
debug_identity_op_kwargs["tfdbg_run_id"] = self._tfdbg_run_id
if tensor_debug_mode == debug_event_pb2.TensorDebugMode.NO_TENSOR: if tensor_debug_mode == debug_event_pb2.TensorDebugMode.NO_TENSOR:
if (not self._should_dump_tensor(op_type, tensor.dtype) or if (not self._should_dump_tensor(op_type, tensor.dtype) or
not tensor.dtype.is_numpy_compatible): not tensor.dtype.is_numpy_compatible):
@ -873,7 +885,8 @@ def disable_dump_debug_info():
""" """
if hasattr(_state, "dumping_callback"): if hasattr(_state, "dumping_callback"):
dump_root = _state.dumping_callback.dump_root dump_root = _state.dumping_callback.dump_root
debug_events_writer.DebugEventsWriter(dump_root).Close() tfdbg_run_id = _state.dumping_callback.tfdbg_run_id
debug_events_writer.DebugEventsWriter(dump_root, tfdbg_run_id).Close()
op_callbacks.remove_op_callback(_state.dumping_callback.callback) op_callbacks.remove_op_callback(_state.dumping_callback.callback)
function_lib.remove_function_callback( function_lib.remove_function_callback(
_state.dumping_callback.function_callback) _state.dumping_callback.function_callback)

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import os import os
import shutil import shutil
import tempfile import tempfile
import uuid
from tensorflow.python.debug.lib import check_numerics_callback from tensorflow.python.debug.lib import check_numerics_callback
from tensorflow.python.debug.lib import debug_events_reader from tensorflow.python.debug.lib import debug_events_reader
@ -35,6 +36,7 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase):
def setUp(self): def setUp(self):
super(DumpingCallbackTestBase, self).setUp() super(DumpingCallbackTestBase, self).setUp()
self.dump_root = tempfile.mkdtemp() self.dump_root = tempfile.mkdtemp()
self.tfdbg_run_id = str(uuid.uuid4())
def tearDown(self): def tearDown(self):
if os.path.isdir(self.dump_root): if os.path.isdir(self.dump_root):

View File

@ -982,7 +982,7 @@ tf_module {
} }
member_method { member_method {
name: "DebugIdentityV2" name: "DebugIdentityV2"
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], "
} }
member_method { member_method {
name: "DebugNanCount" name: "DebugNanCount"

View File

@ -982,7 +982,7 @@ tf_module {
} }
member_method { member_method {
name: "DebugIdentityV2" name: "DebugIdentityV2"
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], " argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'tfdbg_run_id\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'\', \'None\'], "
} }
member_method { member_method {
name: "DebugNanCount" name: "DebugNanCount"