diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD
index ab02cfae96b..28ccf358dcf 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD
+++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD
@@ -27,6 +27,7 @@ cc_library(
     srcs = ["conv_emitter.cc"],
     hdrs = ["conv_emitter.h"],
     deps = [
+        ":conv_emitter_transforms",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
@@ -39,6 +40,23 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "conv_emitter_transforms",
+    srcs = ["conv_emitter_transforms.cc"],
+    hdrs = ["conv_emitter_transforms.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/types:span",
+        "@llvm-project//llvm:support",
+        "@llvm-project//mlir:Affine",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:StandardOps",
+        "@llvm-project//mlir:TransformUtils",
+    ],
+)
+
 tf_cc_test(
     name = "conv_emitter_test",
     srcs = ["conv_emitter_test.cc"],
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc
index 5ec8d3bb334..c17d686f7dc 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "mlir/Transforms/LoopUtils.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.h"
 #include "tensorflow/compiler/xla/window_util.h"
 
 namespace xla {
@@ -109,48 +110,6 @@ ShapeInfo GetShapeInfo(
   return shape_info;
 }
 
-bool IsSimpleLoop(mlir::AffineForOp loop) {
-  return loop.getLowerBoundMap().isSingleConstant() &&
-         loop.getLowerBoundMap().getSingleConstantResult() == 0 &&
-         loop.getStep() == 1 && loop.getUpperBoundMap().getNumResults() == 1 &&
-         std::next(loop.region().begin()) == loop.region().end();
-}
-
-struct BoundAffineMap {
-  mlir::AffineMap affine_map;
-  std::vector<mlir::Value> operands;
-};
-
-BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op) {
-  if (auto load = mlir::dyn_cast<mlir::AffineLoadOp>(op)) {
-    return {load.getAffineMap(),
-            std::vector<mlir::Value>(load.getMapOperands().begin(),
-                                     load.getMapOperands().end())};
-  } else if (auto store = mlir::dyn_cast<mlir::AffineStoreOp>(op)) {
-    return {store.getAffineMap(),
-            std::vector<mlir::Value>(store.getMapOperands().begin(),
-                                     store.getMapOperands().end())};
-  } else {
-    CHECK(false);
-  }
-}
-
-mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op,
-                                       BoundAffineMap new_affine,
-                                       mlir::OpBuilder builder) {
-  if (auto load = mlir::dyn_cast<mlir::AffineLoadOp>(op)) {
-    return builder.create<mlir::AffineLoadOp>(
-        builder.getUnknownLoc(), load.getMemRef(), new_affine.affine_map,
-        new_affine.operands);
-  } else if (auto store = mlir::dyn_cast<mlir::AffineStoreOp>(op)) {
-    return builder.create<mlir::AffineStoreOp>(
-        builder.getUnknownLoc(), store.getValueToStore(), store.getMemRef(),
-        new_affine.affine_map, new_affine.operands);
-  } else {
-    CHECK(false);
-  }
-}
-
 void SetMemRef(mlir::Operation* op, mlir::Value memref) {
   if (auto load = mlir::dyn_cast<mlir::AffineLoadOp>(op)) {
     load.setMemRef(memref);
@@ -161,127 +120,6 @@ void SetMemRef(mlir::Operation* op, mlir::Value memref) {
   }
 }
 
-std::vector<mlir::AffineForOp> CreateNestedSimpleLoops(
-    absl::Span<const int64_t> upper_bounds, mlir::OpBuilder builder) {
-  std::vector<mlir::AffineForOp> loops;
-  loops.reserve(upper_bounds.size());
-  for (int64_t dim : upper_bounds) {
-    auto loop =
-        builder.create<mlir::AffineForOp>(builder.getUnknownLoc(), 0, dim);
-    loops.push_back(loop);
-    builder = loop.getBodyBuilder();
-  }
-  return loops;
-}
-
-void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound,
-                           mlir::OpBuilder builder) {
-  CHECK(IsSimpleLoop(loop));
-
-  loop.setUpperBoundMap(mlir::AffineMap::get(
-      loop.getUpperBoundMap().getNumDims(),
-      loop.getUpperBoundMap().getNumSymbols(), {new_bound}));
-}
-
-// Tile a loop with trip count N by `size`. For now, N has to be a multiple of
-// size, but later this constraint will be removed.
-//
-// The major loop (with trip count N / size) stays as-is, while the minor loop
-// (with trip count `size`) will take over the body of `target`, and be placed
-// as the new body of `target`.
-//
-// `target` has to be within the same "perfectly nested loop group" as `loop`.
-// See the documentation for mlir::getPerfectlyNestedLoops.
-//
-// Example:
-// Before tiling `loop` with tile size X:
-//   for (loop in N)
-//     for (unrelated_loop in ...)
-//       for (target in ...)
-//         // pass loop into affine maps
-// After:
-//   for (loop in N / X)
-//     for (unrelated_loop in ...)
-//       for (target in ...)
-//         for (tiled_loop in X)
-//           // rewrite all affine exprs from loop to `loop * X + tiled_loop`.
-//
-// Design note:
-// TileLoop is different from mlir::tile. At the moment, mlir::tile is not well
-// documented about the exact tiling semantics, but the observed behavior is:
-//   for (i from 0 to N)
-//     for (unrelated_loop in ...)
-//       for (target in ...)
-//         // pass i into affine maps
-// =>
-//   for (i from 0 to N, step = X)
-//     for (unrelated_loop in ...)
-//       for (target in ...)
-//         for (j from i to min(i + X, N), step = 1)
-//           // pass j into affine maps
-//
-// There are two differences between mlir::tile and TileLoop:
-// * TileLoop always puts the tiling logic "stepping" logic into AffineExprs.
-//   With that all index calculation is done in AffineExprs and easier to
-//   analyze in a single place.
-// * TileLoop doesn't plan to use use max() and min() to resolve the issue when
-//   N % X != 0. max() and min() are not representable in AffineExprs.
-//   TODO(timshen): support the case where N % X != 0.
-//
-// TODO(timshen): consider the possibility to reuse mlir::tile's logic to
-// achieve the same goal.
-mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size,
-                           mlir::AffineForOp target) {
-  CHECK(IsSimpleLoop(loop));
-  CHECK(IsSimpleLoop(target));
-  {
-    llvm::SmallVector<mlir::AffineForOp, 4> all_loops;
-    getPerfectlyNestedLoops(all_loops, loop);
-    CHECK(absl::c_linear_search(all_loops, target));
-  }
-
-  auto builder = target.getBodyBuilder();
-
-  auto inner_loop =
-      builder.create<mlir::AffineForOp>(builder.getUnknownLoc(), 0, size);
-  {
-    auto& inner_operations = inner_loop.getBody()->getOperations();
-    auto& target_operations = target.getBody()->getOperations();
-
-    inner_operations.splice(inner_operations.begin(), target_operations,
-                            target_operations.begin(),
-                            std::prev(target_operations.end(), 2));
-
-    mlir::AffineExpr length = loop.getUpperBoundMap().getResult(0);
-    CHECK_EQ(0, length.cast<mlir::AffineConstantExpr>().getValue() % size);
-    SetBoundForSimpleLoop(loop, length.ceilDiv(size), builder);
-  }
-
-  for (auto& use :
-       llvm::make_early_inc_range(loop.getInductionVar().getUses())) {
-    mlir::Operation* owner = use.getOwner();
-    BoundAffineMap affine_map = GetBoundAffineMapFrom(owner);
-    unsigned new_dim = affine_map.operands.size();
-    affine_map.operands.push_back(inner_loop.getInductionVar());
-    std::vector<mlir::AffineExpr> replacements;
-    for (int i = 0; i < affine_map.affine_map.getNumDims(); i++) {
-      if (affine_map.operands[i] == loop.getInductionVar()) {
-        replacements.push_back(builder.getAffineDimExpr(i) * size +
-                               builder.getAffineDimExpr(new_dim));
-      } else {
-        replacements.push_back(builder.getAffineDimExpr(i));
-      }
-    }
-    affine_map.affine_map = affine_map.affine_map.replaceDimsAndSymbols(
-        replacements, {}, affine_map.operands.size(), 0);
-    auto new_op =
-        CloneWithNewAffineMap(owner, affine_map, mlir::OpBuilder(owner));
-    owner->replaceAllUsesWith(new_op);
-    owner->erase();
-  }
-  return inner_loop;
-}
-
 // Hoist operations out of `where`. [begin_op, end_op) must be the first
 // operations of their parent loop, and `where` must be an ancestor of that
 // parent loop.
@@ -387,21 +225,6 @@ mlir::Operation* HoistAndFix(mlir::Operation* op, mlir::AffineForOp where) {
   return HoistAndFix(op->getIterator(), std::next(op->getIterator()), where);
 }
 
-// Sinks a segment of perfectly nested loops to the bottom. It implements this
-// by rotating the loop nest by rotate_amount.
-void SinkPerfectlyNestedLoops(absl::Span<const mlir::AffineForOp> loops,
-                              int rotate_amount) {
-  CHECK_GE(rotate_amount, 0);
-  std::vector<unsigned> permutation(loops.size());
-  std::iota(permutation.begin(), permutation.end(), unsigned(0));
-  std::rotate(permutation.begin(),
-              permutation.begin() + loops.size() - rotate_amount,
-              permutation.end());
-  mlir::interchangeLoops(
-      llvm::ArrayRef<mlir::AffineForOp>(loops.begin(), loops.end()),
-      permutation);
-}
-
 struct InitialMlirConvAnchors {
   std::vector<mlir::AffineForOp> cartesian_product_loops;
   std::vector<mlir::AffineForOp> reduction_loops;
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc
new file mode 100644
index 00000000000..ec9e1c93f83
--- /dev/null
+++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.cc
@@ -0,0 +1,152 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.h"
+
+#include "absl/algorithm/container.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"  // from @llvm-project
+#include "mlir/Transforms/LoopUtils.h"  // from @llvm-project
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace mlir_gpu {
+
+BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op) {
+  if (auto load = mlir::dyn_cast<mlir::AffineLoadOp>(op)) {
+    return {load.getAffineMap(),
+            std::vector<mlir::Value>(load.getMapOperands().begin(),
+                                     load.getMapOperands().end())};
+  } else if (auto store = mlir::dyn_cast<mlir::AffineStoreOp>(op)) {
+    return {store.getAffineMap(),
+            std::vector<mlir::Value>(store.getMapOperands().begin(),
+                                     store.getMapOperands().end())};
+  } else {
+    CHECK(false);
+  }
+}
+
+mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op,
+                                       BoundAffineMap new_affine,
+                                       mlir::OpBuilder builder) {
+  if (auto load = mlir::dyn_cast<mlir::AffineLoadOp>(op)) {
+    return builder.create<mlir::AffineLoadOp>(
+        builder.getUnknownLoc(), load.getMemRef(), new_affine.affine_map,
+        new_affine.operands);
+  } else if (auto store = mlir::dyn_cast<mlir::AffineStoreOp>(op)) {
+    return builder.create<mlir::AffineStoreOp>(
+        builder.getUnknownLoc(), store.getValueToStore(), store.getMemRef(),
+        new_affine.affine_map, new_affine.operands);
+  } else {
+    CHECK(false);
+  }
+}
+
+bool IsSimpleLoop(mlir::AffineForOp loop) {
+  return loop.getLowerBoundMap().isSingleConstant() &&
+         loop.getLowerBoundMap().getSingleConstantResult() == 0 &&
+         loop.getStep() == 1 && loop.getUpperBoundMap().getNumResults() == 1 &&
+         std::next(loop.region().begin()) == loop.region().end();
+}
+
+std::vector<mlir::AffineForOp> CreateNestedSimpleLoops(
+    absl::Span<const int64_t> upper_bounds, mlir::OpBuilder builder) {
+  std::vector<mlir::AffineForOp> loops;
+  loops.reserve(upper_bounds.size());
+  for (int64_t dim : upper_bounds) {
+    auto loop =
+        builder.create<mlir::AffineForOp>(builder.getUnknownLoc(), 0, dim);
+    loops.push_back(loop);
+    builder = loop.getBodyBuilder();
+  }
+  return loops;
+}
+
+void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound,
+                           mlir::OpBuilder builder) {
+  CHECK(IsSimpleLoop(loop));
+
+  loop.setUpperBoundMap(mlir::AffineMap::get(
+      loop.getUpperBoundMap().getNumDims(),
+      loop.getUpperBoundMap().getNumSymbols(), {new_bound}));
+}
+
+mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size,
+                           mlir::AffineForOp target) {
+  CHECK(IsSimpleLoop(loop));
+  CHECK(IsSimpleLoop(target));
+  {
+    llvm::SmallVector<mlir::AffineForOp, 4> all_loops;
+    getPerfectlyNestedLoops(all_loops, loop);
+    CHECK(absl::c_linear_search(all_loops, target));
+  }
+
+  auto builder = target.getBodyBuilder();
+
+  auto inner_loop =
+      builder.create<mlir::AffineForOp>(builder.getUnknownLoc(), 0, size);
+  {
+    auto& inner_operations = inner_loop.getBody()->getOperations();
+    auto& target_operations = target.getBody()->getOperations();
+
+    inner_operations.splice(inner_operations.begin(), target_operations,
+                            target_operations.begin(),
+                            std::prev(target_operations.end(), 2));
+
+    mlir::AffineExpr length = loop.getUpperBoundMap().getResult(0);
+    CHECK_EQ(0, length.cast<mlir::AffineConstantExpr>().getValue() % size);
+    SetBoundForSimpleLoop(loop, length.ceilDiv(size), builder);
+  }
+
+  for (auto& use :
+       llvm::make_early_inc_range(loop.getInductionVar().getUses())) {
+    mlir::Operation* owner = use.getOwner();
+    BoundAffineMap affine_map = GetBoundAffineMapFrom(owner);
+    unsigned new_dim = affine_map.operands.size();
+    affine_map.operands.push_back(inner_loop.getInductionVar());
+    std::vector<mlir::AffineExpr> replacements;
+    for (int i = 0; i < affine_map.affine_map.getNumDims(); i++) {
+      if (affine_map.operands[i] == loop.getInductionVar()) {
+        replacements.push_back(builder.getAffineDimExpr(i) * size +
+                               builder.getAffineDimExpr(new_dim));
+      } else {
+        replacements.push_back(builder.getAffineDimExpr(i));
+      }
+    }
+    affine_map.affine_map = affine_map.affine_map.replaceDimsAndSymbols(
+        replacements, {}, affine_map.operands.size(), 0);
+    auto new_op =
+        CloneWithNewAffineMap(owner, affine_map, mlir::OpBuilder(owner));
+    owner->replaceAllUsesWith(new_op);
+    owner->erase();
+  }
+  return inner_loop;
+}
+
+void SinkPerfectlyNestedLoops(absl::Span<const mlir::AffineForOp> loops,
+                              int rotate_amount) {
+  CHECK_GE(rotate_amount, 0);
+  std::vector<unsigned> permutation(loops.size());
+  std::iota(permutation.begin(), permutation.end(), unsigned(0));
+  std::rotate(permutation.begin(),
+              permutation.begin() + loops.size() - rotate_amount,
+              permutation.end());
+  mlir::interchangeLoops(
+      llvm::ArrayRef<mlir::AffineForOp>(loops.begin(), loops.end()),
+      permutation);
+}
+
+}  // namespace mlir_gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.h b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.h
new file mode 100644
index 00000000000..76348b376b2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_transforms.h
@@ -0,0 +1,102 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_
+
+#include "absl/base/integral_types.h"
+#include "absl/types/span.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"  // from @llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+
+namespace xla {
+namespace mlir_gpu {
+
+struct BoundAffineMap {
+  mlir::AffineMap affine_map;
+  std::vector<mlir::Value> operands;
+};
+
+BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op);
+mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op,
+                                       BoundAffineMap new_affine,
+                                       mlir::OpBuilder builder);
+
+bool IsSimpleLoop(mlir::AffineForOp loop);
+std::vector<mlir::AffineForOp> CreateNestedSimpleLoops(
+    absl::Span<const int64_t> upper_bounds, mlir::OpBuilder builder);
+void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound,
+                           mlir::OpBuilder builder);
+
+// Tile a loop with trip count N by `size`. For now, N has to be a multiple of
+// size, but later this constraint will be removed.
+//
+// The major loop (with trip count N / size) stays as-is, while the minor loop
+// (with trip count `size`) will take over the body of `target`, and be placed
+// as the new body of `target`.
+//
+// `target` has to be within the same "perfectly nested loop group" as `loop`.
+// See the documentation for mlir::getPerfectlyNestedLoops.
+//
+// Example:
+// Before tiling `loop` with tile size X:
+//   for (loop in N)
+//     for (unrelated_loop in ...)
+//       for (target in ...)
+//         // pass loop into affine maps
+// After:
+//   for (loop in N / X)
+//     for (unrelated_loop in ...)
+//       for (target in ...)
+//         for (tiled_loop in X)
+//           // rewrite all affine exprs from loop to `loop * X + tiled_loop`.
+//
+// Design note:
+// TileLoop is different from mlir::tile. At the moment, mlir::tile does not
+// clearly document its exact tiling semantics, but the observed behavior is:
+//   for (i from 0 to N)
+//     for (unrelated_loop in ...)
+//       for (target in ...)
+//         // pass i into affine maps
+// =>
+//   for (i from 0 to N, step = X)
+//     for (unrelated_loop in ...)
+//       for (target in ...)
+//         for (j from i to min(i + X, N), step = 1)
+//           // pass j into affine maps
+//
+// There are two differences between mlir::tile and TileLoop:
+// * TileLoop always puts the tiling "stepping" logic into AffineExprs, so that
+//   all index calculation is done in AffineExprs and is easier to analyze in a
+//   single place.
+// * TileLoop doesn't plan to use max() and min() to handle the case where
+//   N % X != 0, because max() and min() are not representable in AffineExprs.
+//   TODO(timshen): support the case where N % X != 0.
+//
+// TODO(timshen): consider the possibility to reuse mlir::tile's logic to
+// achieve the same goal.
+mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size,
+                           mlir::AffineForOp target);
+
+// Sinks a segment of perfectly nested loops to the bottom. It implements this
+// by rotating the loop nest by rotate_amount.
+void SinkPerfectlyNestedLoops(absl::Span<const mlir::AffineForOp> loops,
+                              int rotate_amount);
+
+}  // namespace mlir_gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_
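
The index rewrite documented for TileLoop can be sanity-checked with plain integer arithmetic. The standalone program below is an illustration only and is not part of the patch; the values N and X are hypothetical stand-ins for the loop's trip count and the tile size. It verifies that a simple loop over [0, N) and the tiled pair of loops with index `outer * X + inner` enumerate exactly the same indices when N is a multiple of X, which is the `loop * X + tiled_loop` substitution that TileLoop applies to the affine maps.

// Illustration only -- not part of the patch above. Checks the tiling index
// identity under TileLoop's current N % X == 0 constraint.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  constexpr int64_t N = 12;  // hypothetical trip count
  constexpr int64_t X = 4;   // hypothetical tile size, with N % X == 0
  std::vector<int64_t> untiled, tiled;
  for (int64_t i = 0; i < N; ++i) untiled.push_back(i);
  for (int64_t outer = 0; outer < N / X; ++outer) {
    for (int64_t inner = 0; inner < X; ++inner) {
      tiled.push_back(outer * X + inner);  // the rewritten affine expression
    }
  }
  assert(untiled == tiled);  // both orders visit 0, 1, ..., N - 1
  return 0;
}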
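SinkPerfectlyNestedLoops builds its interchange permutation with std::iota followed by std::rotate before handing it to mlir::interchangeLoops. The snippet below, again illustrative and not part of the patch, reproduces only that permutation computation for a hypothetical four-deep nest with rotate_amount = 2, so the resulting rotation of the identity permutation is easy to see.

// Illustration only -- not part of the patch above. Reproduces the
// permutation computed inside SinkPerfectlyNestedLoops.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const size_t num_loops = 4;   // hypothetical nest depth
  const int rotate_amount = 2;  // hypothetical rotation amount
  std::vector<unsigned> permutation(num_loops);
  std::iota(permutation.begin(), permutation.end(), unsigned(0));
  std::rotate(permutation.begin(),
              permutation.begin() + num_loops - rotate_amount,
              permutation.end());
  // Prints "2 3 0 1": the identity permutation rotated by rotate_amount.
  for (unsigned p : permutation) std::printf("%u ", p);
  std::printf("\n");
  return 0;
}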