From 97d32f49b4afb84d08f116e4681cb48fe4b8d6f8 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 29 Oct 2020 17:16:08 -0700 Subject: [PATCH] [tf.data] Log a warning when incomplete file-based cache is finalized. PiperOrigin-RevId: 339778319 Change-Id: Idaccb9cbb7c315ad36efc3cb23ef8e4086db287a --- .../core/kernels/data/cache_dataset_ops.cc | 19 ++++++++++++------- .../dataset_serialization_test_base.py | 5 ++--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc index c9883f9c938..3d0a51404ba 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc @@ -38,6 +38,8 @@ namespace data { /* static */ constexpr const char* const CacheDatasetOp::kOutputTypes; /* static */ constexpr const char* const CacheDatasetOp::kOutputShapes; +namespace { + constexpr char kKeyStrFormat[] = "%%%zuzu_%%%zuzu"; constexpr char kPaddingSizeStrFormat[] = "%zu"; constexpr char kFileDatasetPrefix[] = "File"; @@ -57,6 +59,14 @@ constexpr char kCacheCompleted[] = "cache_completed"; constexpr char kIndex[] = "index"; constexpr char kImpl[] = "Impl"; constexpr char kCacheDataset[] = "CacheDataset"; +constexpr char kIncompleteCacheErrorMessage[] = + "The calling iterator did not fully read the dataset being cached. In " + "order to avoid unexpected truncation of the dataset, the partially cached " + "contents of the dataset will be discarded. This can happen if you have " + "an input pipeline similar to `dataset.cache().take(k).repeat()`. 
You " "should use `dataset.take(k).cache().repeat()` instead."; + +} // namespace class CacheDatasetOp::FileDatasetBase : public DatasetBase { public: @@ -220,6 +230,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { ~FileWriterIterator() override { if (!dataset()->env_->FileExists(MetaFilename(filename_)).ok()) { + LOG(WARNING) << kIncompleteCacheErrorMessage; std::vector cache_files; Status s = dataset()->env_->GetMatchingPaths( strings::StrCat(filename_, "*"), &cache_files); @@ -754,13 +765,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase { ~MemoryWriterIterator() override { mutex_lock l(mu_); if (!temp_cache_.empty() && !cache_->IsCompleted()) { - LOG(WARNING) - << "The calling iterator did not fully read the dataset being " "cached. In order to avoid unexpected truncation of the " "dataset, the partially cached contents of the dataset " "will be discarded. This can happen if you have an input " "pipeline similar to `dataset.cache().take(k).repeat()`. " "You should use `dataset.take(k).cache().repeat()` instead."; + LOG(WARNING) << kIncompleteCacheErrorMessage; cache_->Reset(); } } diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py index aea4934260e..44fe30f6729 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py @@ -39,8 +39,7 @@ from tensorflow.python.util import nest def remove_variants(get_next_op): - # TODO(b/72408568): Remove this once session.run can get - # variant tensors. + # TODO(b/72408568): Remove this once session.run can get variant tensors. 
"""Remove variants from a nest structure, so sess.run will execute.""" def _remove_variant(x): @@ -61,7 +60,7 @@ class DatasetSerializationTestBase(test.TestCase): # TODO(b/72657739): Remove sparse_tensor argument, which is to test the # (deprecated) saveable `SparseTensorSliceDataset`, once the API - # `from_sparse_tensor_slices()`and related tests are deleted. + # `from_sparse_tensor_slices()` and related tests are deleted. def run_core_tests(self, ds_fn, num_outputs, sparse_tensors=False): """Runs the core tests.