From abe233392e233627cab86c1b0dbdeac118885ac8 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 16 Nov 2020 15:23:37 -0800 Subject: [PATCH] [tf.data] Check cycle length when restoring parallel interleave iterator. If we try to restore into an iterator with a smaller cycle length than the original, it will produce a segmentation fault. This can happen either due to user error, or due to the cycle_length being autotuned. This CL is a stopgap solution to give a better error message than a segmentation fault. In the long term we aim to support adjusting the cycle_length so that autotuned cycle_length + checkpointing just works. PiperOrigin-RevId: 342733442 Change-Id: Ie9869224cc1598e74e6eb00397df35e6a1a46859 --- .../data/parallel_interleave_dataset_op.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index 8acc2f69e1d..244afede675 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -1317,7 +1317,20 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { mutex_lock l(*mu_); TF_RETURN_IF_ERROR( reader->ReadScalar(prefix(), kCurrentElementsSize, &size)); - DCHECK_EQ(current_elements_.size(), size); + if (current_elements_.size() != size) { + // This could mean two things: (1) the user created their checkpoint + // from a dataset with one cycle_length, then changed the cycle_length + // and tried to restore from the old checkpoint, or (2) the user set + // the cycle length to tf.data.AUTOTUNE, wrote the checkpoint from one + // machine, then tried to restore the checkpoint on another machine + // with a different CPU budget (causing autotune to pick a different + // cycle length). 
+ return errors::FailedPrecondition( + "The iterator cycle length ", current_elements_.size(), + " is different from the cycle length to restore from the " + "checkpoint: ", + size); + } } if (size == 0) { return Status::OK();