[rollback] Support new pipelines in autosharding by including it in FILE autosharding policy
PiperOrigin-RevId: 336902256
Change-Id: I977591868b46405e57612251777fdab4206c4d71
parent d9ad5ce61b
commit 6df9f5a51d
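For context, the code removed below taught the FILE autosharding policy to recognize a "dataset of reader datasets" pipeline: one reader dataset per file glob, flattened with flat_map and then interleaved. A rough sketch of that pipeline shape, following the pseudocode in the comment removed below (make_dataset_pipeline, file_globs, and cycle_length=2 are illustrative, not an exact reproduction of any user pipeline):

import tensorflow as tf

def make_dataset_pipeline(file_globs):
  # Sketch only: one reader dataset per glob; each glob may match several
  # TFRecord files.
  datasets = []
  for file_glob in file_globs:
    datasets.append(
        tf.data.Dataset.list_files(file_glob, shuffle=False).map(
            tf.data.TFRecordDataset))
  # Build a dataset whose elements are the reader datasets, then flatten it.
  dataset = tf.data.Dataset.from_tensor_slices(datasets)
  dataset = dataset.flat_map(lambda x: x)
  # ... additional preprocessing ...
  dataset = dataset.interleave(lambda x: x, cycle_length=2)
  return dataset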
@@ -45,9 +45,6 @@ constexpr char kShuffleDatasetV3OpName[] = "ShuffleDatasetV3";
 constexpr char kPrefetchDatasetOpName[] = "PrefetchDataset";
 constexpr char kRebatchDatasetOpName[] = "RebatchDataset";
 constexpr char kRebatchDatasetV2OpName[] = "RebatchDatasetV2";
-constexpr char kTensorDatasetOpName[] = "TensorDataset";
-constexpr char kTensorSliceDatasetOpName[] = "TensorSliceDataset";
-constexpr char kPlaceholderOpName[] = "Placeholder";
 
 constexpr char kNumWorkersAttrName[] = "num_workers";
 constexpr char kNumReplicasAttrName[] = "num_replicas";
@@ -71,13 +68,12 @@ constexpr std::array<const char*, 2> kMultipleInputsDatasetOps = {
     "ZipDataset"
 };
 
-constexpr std::array<const char*, 26> kPassThroughOps = {
+constexpr std::array<const char*, 25> kPassThroughOps = {
     "_Retval",
     "AssertNextDataset",
     "BatchDataset",
     "CacheDataset",
     "ExperimentalMapAndBatchDataset",
-    "ExperimentalParseExampleDataset",
     "ExperimentalRebatchDataset",
     "FilterDataset",
     "Identity",
@@ -417,33 +413,6 @@ Status ProcessDatasetSourceNode(MutableGraphView* graph, const NodeDef& node,
   return Status::OK();
 }
 
-const NodeDef* FindFuncAndTensorSliceDataset(
-    const NodeDef* node, int64 num_workers, int64 index,
-    FunctionLibraryDefinition* flib, MutableGraphView* graph,
-    absl::flat_hash_set<string>* nodes_to_delete) {
-  if (IsDatasetNodeOfType(*node, kFuncDatasetOps)) {
-    const NodeDef* input_node = graph_utils::GetInputNode(*node, *graph, 0);
-    if (input_node->op() == kTensorSliceDatasetOpName ||
-        input_node->op() == kTensorDatasetOpName) {
-      const NodeDef* next_input_node =
-          graph_utils::GetInputNode(*input_node, *graph, 0);
-      if (next_input_node->op() == kPlaceholderOpName) {
-        return node;
-      }
-    }
-  }
-
-  if (!IsDatasetNodeOfType(*node, kPassThroughOps)) {
-    return nullptr;
-  }
-
-  // Sometimes there are other nodes between the last InterleaveDataset and the
-  // second to last FlatMapDataset, so we need to skip over those.
-  const NodeDef* input_node = graph_utils::GetInputNode(*node, *graph, 0);
-  return FindFuncAndTensorSliceDataset(input_node, num_workers, index, flib,
-                                       graph, nodes_to_delete);
-}
-
 Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, int64 index,
                            FunctionLibraryDefinition* flib,
                            MutableGraphView* graph,
@@ -472,39 +441,6 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, int64 index,
     return Status::OK();
   }
 
-  // This handles the case for the following subgraph:
-  //   Placeholder -> TensorSliceDataset -> FlatMapDataset -x->
-  //   (other preprocessing datasets) -> InterleaveDataset
-  // and then inserting the shard node immediately after the FlatMapDataset.
-  //
-  // This is used for some training pipelines where a dataset is created with
-  // the following code:
-  //
-  // def make_dataset_pipeline():
-  //   file_globs = [...]
-  //   datasets = []
-  //   for file_glob in file_globs:
-  //     datasets.append(Dataset.list_files(file_glob).map(TFRecordReader))
-  //   dataset = Dataset.from_tensor_slices(datasets)
-  //   dataset = dataset.flat_map(lambda x: x)
-  //   dataset = ...  # additional preprocessing
-  //   dataset = dataset.interleave(lambda x: x, cycle_length=...)
-  //   return dataset
-  if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
-    const NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
-    const NodeDef* flat_map_node = FindFuncAndTensorSliceDataset(
-        input_node, num_workers, index, flib, graph, nodes_to_delete);
-
-    if (flat_map_node != nullptr) {
-      auto fanouts = graph->GetFanouts(*flat_map_node, false);
-      // FlatMapDataset should only be the input to one other dataset.
-      if (fanouts.size() == 1) {
-        return ProcessDatasetSourceNode(graph, *fanouts.begin()->node,
-                                        nodes_to_delete, num_workers, index);
-      }
-    }
-  }
-
   // This handles the case where a reader Dataset is contained within a
   // FuncDataset (e.g. FlatMap, ParallelInterleave, etc...). For example:
   //
@@ -634,6 +570,7 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, int64 index,
   MutableGraphView graph(output);
   FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
 
+
   NodeDef* sink_node;
   TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node));
 
@@ -103,43 +103,6 @@ class AutoShardDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase,
     ]
     self.assertDatasetProducesWithShuffle(dataset, expected, 5, 4, shuffle)
 
-  @combinations.generate(
-      combinations.times(test_base.default_test_combinations(),
-                         combinations.combine(batch_size=[1, 3, 10])))
-  def testDatasetOfReaderDatasetsPipeline(self, batch_size):
-    # This tests a scenario where a list_files main return multiple files
-    # due to the glob containing wildcards.
-    def batch(iterator, n):
-      l = len(iterator)
-      for i in range(0, l, n):
-        yield iterator[i:min(i + n, l)]
-
-    datasets = []
-    for files in batch(self.test_filenames, batch_size):
-      datasets.append(
-          dataset_ops.Dataset.list_files(files, shuffle=False).map(
-              core_readers.TFRecordDataset))
-    dataset = dataset_ops.Dataset.from_tensor_slices(datasets)
-    dataset = dataset.flat_map(lambda x: x)
-
-    # Simulate additional ops in between flat_map and interleave. This should be
-    # a no-op since if ShardDataset is placed right after flat_map, we will only
-    # have two datasets left at this point.
-    dataset = dataset.prefetch(1)
-    dataset = dataset.prefetch(1)
-
-    dataset = dataset.interleave(
-        lambda x: x, cycle_length=1, num_parallel_calls=1)
-
-    dataset = distribute._AutoShardDataset(dataset, 5, 0)
-    expected = [
-        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
-        for f in (0, 5)
-        for r in range(0, 10)
-    ]
-
-    self.assertDatasetProduces(dataset, expected)
-
   @combinations.generate(test_base.default_test_combinations())
   def testZipReaderPipeline(self):
     dataset1 = dataset_ops.Dataset.list_files(