From ce477dd2bbaa8a59fda9a582af724e0e86cea8c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 15 Dec 2020 12:22:48 -0800
Subject: [PATCH] Op documentation update. 	update of
 g3doc/includes/tf_passes.md

PiperOrigin-RevId: 347668072
Change-Id: I0d9cb91a18c56347844c8d3c699fcbd8f878bda4
---
 .../compiler/mlir/g3doc/includes/tf_passes.md | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
diff --git a/tensorflow/compiler/mlir/g3doc/includes/tf_passes.md b/tensorflow/compiler/mlir/g3doc/includes/tf_passes.md
index 0182ae19cc5..5debc00a781 100644
--- a/tensorflow/compiler/mlir/g3doc/includes/tf_passes.md
+++ b/tensorflow/compiler/mlir/g3doc/includes/tf_passes.md
@@ -39,3 +39,79 @@ func @my_fn(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>
 ```
 -max-iterations : Maximum shape inference iterations
 ```
+### `-tf-tpu-cluster-formation`: Form clusters from operations assigned to the same TPU computation
+TPU computations from the frontend are composed of a `tf.TPUReplicateMetadata`
+op, a subgraph of ops (TensorFlow Dialect) each with a matching `_tpu_replicate`
+attribute relative to the associated `tf.TPUReplicateMetadata` op, and
+optionally `tf.TPUReplicatedInput` and `tf.TPUReplicatedOutput` ops feeding in
+inputs and outputs to and from a replicated TPU computation. The number of times
+a TPU computation is replicated is defined in the `tf.TPUReplicateMetadata` op
+(`num_replicas` attribute) and operand and result sizes of
+`tf.TPUReplicatedInput` and `tf.TPUReplicatedOutput` respectively must match,
+excluding packed tensors. It is also assumed ops of the same TPU computation do
+not have ops outside of the TPU computation that are both inputs and outputs to
+the same TPU computation.
+
+This pass takes the TPU computation subgraph, moves it into a
+`tf_device.cluster`, and copies over attributes from the associated
+`tf.TPUReplicateMetadata` op to the newly created `tf_device.cluster`. If the
+computation is replicated (`num_replicas` > 1), the `num_replicas` attribute is
+not copied over but instead the `tf_device.cluster` is further wrapped with a
+`tf_device.replicate`, and associated `tf.TPUReplicatedInput` and
+`tf.TPUReplicatedOutput` ops are replaced as the `tf_device.replicate` operands
+and results. Otherwise, the single operands and results of the associated
+`tf.TPUReplicatedInput` and `tf.TPUReplicatedOutput` ops are simply forwarded to
+the `tf_device.cluster`.
+
+For example, the following non-replicated computation:
+
+```mlir
+func @tpu_computation(%arg0: tensor<i32>) -> tensor<i32> {
+  // Metadata op for cluster `cluster` with 1 replica, 1 core per replica and
+  // with topology `<topology>`.
+  "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", num_replicas = 1, num_cores_per_replica = 1, topology = "<topology>", device_assignment = [], padding_map = []} : () -> ()
+  %replicated_input = "tf.TPUReplicatedInput"(%arg0) : (tensor<i32>) -> tensor<i32>
+  %identity = "tf.Identity"(%replicated_input) {_tpu_replicate = "cluster"} : (tensor<i32>) -> tensor<i32>
+  %replicated_output = "tf.TPUReplicatedOutput"(%identity) : (tensor<i32>) -> tensor<i32>
+  return %replicated_output : tensor<i32>
+}
+```
+
+will be transformed into:
+
+```mlir
+func @tpu_computation(%arg0: tensor<i32>) -> tensor<i32> {
+  %cluster = "tf_device.cluster"() ( {
+    %identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
+    tf_device.return %identity : tensor<i32>
+  }) {_tpu_replicate = "cluster", num_cores_per_replica = 1, topology = "<topology>", device_assignment = [], padding_map = []} : () -> (tensor<i32>)
+  return %cluster : tensor<i32>
+}
+```
+
+The following replicated computation:
+
+```mlir
+func @tpu_computation(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
+  "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", num_replicas = 2, num_cores_per_replica = 1, topology = "topology", device_assignment = [], padding_map = []} : () -> ()
+  %replicated_input = "tf.TPUReplicatedInput"(%arg0, %arg1) : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %identity = "tf.Identity"(%replicated_input) {_tpu_replicate = "cluster"} : (tensor<i32>) -> tensor<i32>
+  %replicated_output:2 = "tf.TPUReplicatedOutput"(%identity) : (tensor<i32>) -> (tensor<i32>, tensor<i32>)
+  return %replicated_output#0, %replicated_output#1 : tensor<i32>, tensor<i32>
+}
+```
+
+will be transformed into:
+
+```mlir
+func @tpu_computation(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
+  %replicate:2 = tf_device.replicate([%arg0, %arg1] as %replicated_input) {n = 2 : i32} {
+    %cluster = "tf_device.cluster"() ( {
+      %identity = "tf.Identity"(%replicated_input) : (tensor<i32>) -> tensor<i32>
+      tf_device.return %identity : tensor<i32>
+    }) {_tpu_replicate = "cluster", num_cores_per_replica = 1, topology = "topology", device_assignment = [], padding_map = []} : () -> (tensor<i32>)
+    tf_device.return %cluster : tensor<i32>
+  }
+  return %replicate#0, %replicate#1 : tensor<i32>, tensor<i32>
+}
+```