Published the GPU delegates.
PiperOrigin-RevId: 240848313
parent fd2db21368
commit fb772b781b
115
tensorflow/lite/delegates/gpu/BUILD
Normal file
@@ -0,0 +1,115 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])  # Apache 2.0

# Primary purpose of this config is to replace ::util::Status with our custom
# light implementation ::tflite::gpu::StatusLite to reduce binary size. Besides
# that, certain features that were hard to communicate without full open source
# were hidden away too such as compiled models, serialization, and metadata.
# While the latter will be fully available with the open source release, the
# former will have to stay until absl::Status is released.
config_setting(
    name = "tflite_gpu_binary_release",
    values = {"copt": "-DTFLITE_GPU_BINARY_RELEASE"},
)

cc_library(
    name = "gl_delegate",
    srcs = ["gl_delegate.cc"],
    hdrs = ["gl_delegate.h"],
    linkopts = select({
        "//tensorflow:android": [
            "-lEGL",
            "-lGLESv3",
        ],
        "//conditions:default": [],
    }),
    deps = [
        "@com_google_absl//absl/types:span",
        "//tensorflow/lite:kernel_api",
        "//tensorflow/lite/c:c_api_internal",
        "//tensorflow/lite/delegates/gpu/common:convert",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_builder",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations",
        "//tensorflow/lite/delegates/gpu/gl:api",
        "//tensorflow/lite/delegates/gpu/gl:command_queue",
        "//tensorflow/lite/delegates/gpu/gl:compiler",
        "//tensorflow/lite/delegates/gpu/gl:egl_environment",
        "//tensorflow/lite/delegates/gpu/gl:gl_call",
        "//tensorflow/lite/delegates/gpu/gl/converters:bhwc_to_phwc4",
        "//tensorflow/lite/delegates/gpu/gl/converters:phwc4_to_bhwc",
        "//tensorflow/lite/delegates/gpu/gl/kernels:registry",
        "//tensorflow/lite/delegates/gpu/gl/workgroups:best_effort_calculator",
    ] + select({
        "//conditions:default": [
            "//tensorflow/lite/delegates/gpu/gl:common_cc_fbs",
            "//tensorflow/lite/delegates/gpu/gl:metadata_cc_fbs",
            "//tensorflow/lite/delegates/gpu/gl:workgroups_cc_fbs",
            "@flatbuffers",
            "//tensorflow/lite/schema:schema_fbs",
        ],
        ":tflite_gpu_binary_release": [],
    }),
)

objc_library(
    name = "metal_delegate",
    srcs = ["metal_delegate.mm"],
    hdrs = ["metal_delegate.h"],
    copts = ["-std=c++11"],
    sdk_frameworks = ["Metal"],
    deps = [
        "//tensorflow/lite:kernel_api",
        "//tensorflow/lite/c:c_api_internal",
        "//tensorflow/lite/delegates/gpu/common:convert",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_builder",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations",
        "//tensorflow/lite/delegates/gpu/metal:api",
        "//tensorflow/lite/delegates/gpu/metal:buffer_convert",
        "//tensorflow/lite/delegates/gpu/metal:compiled_model",
        "//tensorflow/lite/delegates/gpu/metal:inference_context",
        "@com_google_absl//absl/types:span",
    ],
)

# build -c opt --config android_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtflite_gpu_gl.so
cc_binary(
    name = "libtflite_gpu_gl.so",
    linkopts = select({
        "//tensorflow:android": [
            "-lEGL",
            "-lGLESv3",
        ],
        "//conditions:default": [],
    }),
    linkshared = 1,
    linkstatic = 1,
    tags = [
        "nobuilder",
        "notap",
    ],
    deps = [":gl_delegate"],
)

# build -c opt --config ios_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always :libtflite_gpu_metal.so
cc_binary(
    name = "libtflite_gpu_metal.so",
    linkshared = 1,
    linkstatic = 1,
    tags = [
        "nobuilder",
        "notap",
    ],
    deps = [":metal_delegate"],
)
205
tensorflow/lite/delegates/gpu/README.md
Normal file
@@ -0,0 +1,205 @@
# TFLite on GPU

TensorFlow Lite (TFLite) supports several hardware accelerators. This document
describes how to use the GPU backend via the TFLite delegate APIs on Android and
iOS.

GPUs are designed for high throughput on massively parallelizable workloads.
Thus, they are well-suited for deep neural nets, which consist of a huge number
of operators, each working on some input tensor(s) that can be easily divided
into smaller workloads and carried out in parallel, typically resulting in lower
latency. In the best case, inference on the GPU may run fast enough to become
suitable for real-time applications that were previously out of reach.

GPUs do their computation with 16-bit or 32-bit floating point numbers and,
unlike CPUs, do not require quantization for optimal performance. If quantizing
your neural network was not an option because of the accuracy lost to reduced
precision, that concern disappears when running the model on the GPU.

Another benefit of GPU inference is power efficiency. GPUs carry out
computations in a very efficient and optimized way, so they consume less power
and generate less heat than the same task run on CPUs.

TFLite on GPU supports the following ops in 16-bit and 32-bit float precision:

* `ADD v1`
* `AVERAGE_POOL_2D v1`
* `CONCATENATION v1`
* `CONV_2D v1`
* `DEPTHWISE_CONV_2D v1-2`
* `FULLY_CONNECTED v1`
* `LOGISTIC v1`
* `LSTM v2 (Basic LSTM only)`
* `MAX_POOL_2D v1`
* `MUL v1`
* `PAD v1`
* `PRELU v1`
* `RELU v1`
* `RELU6 v1`
* `RESHAPE v1`
* `RESIZE_BILINEAR v1`
* `SOFTMAX v1`
* `STRIDED_SLICE v1`
* `SUB v1`
* `TRANSPOSE_CONV v1`

## Basic Usage

Using TFLite on GPU is as simple as getting the GPU delegate via
`TfLiteGpuDelegateCreate()` and then passing it to
`Interpreter::ModifyGraphWithDelegate()` instead of calling
`Interpreter::AllocateTensors()`:

```c++
////////
// Set up interpreter.
auto model = FlatBufferModel::BuildFromFile(model_path);
ops::builtin::BuiltinOpResolver op_resolver;
std::unique_ptr<Interpreter> interpreter;
InterpreterBuilder(*model, op_resolver)(&interpreter);

////////
// NEW: Prepare GPU delegate.
auto* delegate = TfLiteGpuDelegateCreate(/*options=*/nullptr);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return;

////////
// Run inference.
WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
if (interpreter->Invoke() != kTfLiteOk) return;
ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));

////////
// Clean up.
TfLiteGpuDelegateDelete(delegate);
```

*IMPORTANT:* When calling `Interpreter::ModifyGraphWithDelegate()` or
`Interpreter::Invoke()`, the caller must have an `EGLContext` current on the
calling thread, and `Interpreter::Invoke()` must be called from the same
`EGLContext`. If no such `EGLContext` exists, the delegate will create one
internally, but then the developer must ensure that `Interpreter::Invoke()` is
always called from the same thread that `Interpreter::ModifyGraphWithDelegate()`
was called from.

## Building and Runtime

The TFLite GPU backend uses OpenGL compute shaders and therefore requires
OpenGL ES 3.1 or higher.

```sh
bazel build --config android_arm64 //path/to/your:project
```

On iOS, Metal shaders are used instead; these were introduced with iOS 8. Thus,
compilation flags should look like:

```sh
bazel build --config ios_arm64 //path/to/your:project
```

## Advanced Usage: Delegate Options

There are GPU options that can be set and passed on to
`TfLiteGpuDelegateCreate()`. When the options are set to `nullptr`, as shown in
Basic Usage, they translate to:

```c++
const TfLiteGpuDelegateOptions kDefaultOptions = {
    .metadata = nullptr,
    .compile_options = {
        .precision_loss_allowed = 0,  // false
        .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST,
        .dynamic_batch_enabled = 0,  // false
    },
};
```

Similarly for `NewTfLiteMetalDelegate()`:

```c++
const TfLiteMetalDelegateOptions kDefaultOptions = {
    .precision_loss_allowed = 0,  // false
    .wait_type = TFLITE_METAL_WAIT_TYPE_SLEEP,
};
```

While it is convenient to just supply `nullptr`, it is recommended to set the
options explicitly to avoid unexpected behavior if the default values change.
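
As a concrete illustration, here is a minimal sketch (not taken from the
original docs) of passing explicit options to the OpenGL delegate instead of
`nullptr`. The field names are the ones listed above; enabling
`precision_loss_allowed` is merely an example choice:

```c++
const TfLiteGpuDelegateOptions options = {
    .metadata = nullptr,
    .compile_options = {
        .precision_loss_allowed = 1,  // true: allow reduced (fp16) precision
        .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST,
        .dynamic_batch_enabled = 0,  // false
    },
};
auto* delegate = TfLiteGpuDelegateCreate(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return;
```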

## Advanced Usage: Input/Output Buffers (C++)

To do computation on the GPU, data must be made available to the GPU, which
often means performing a memory copy. It is desirable not to cross the CPU/GPU
memory boundary if possible, as this can take up a significant amount of time.
Usually, such crossing is inevitable, but in some special cases, one copy or the
other can be omitted.

If the network's input is an image already loaded in GPU memory, e.g. a GPU
texture containing the camera feed, it can stay in GPU memory without ever
entering CPU memory. Similarly, if the network's output is in the form of a
renderable image, e.g.
[image style transfer](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf),
it can be displayed on the screen directly.

To let users achieve the best performance, TFLite makes it possible to read
from and write to the delegate's hardware buffer directly, bypassing avoidable
memory copies.

Assuming the camera input is in GPU memory as `GL_TEXTURE_2D`, it must first be
converted to a shader storage buffer object (SSBO) for OpenGL, or to a
`MTLBuffer` object for Metal. A `TfLiteTensor` can be associated with a
user-prepared SSBO or `MTLBuffer` via `TfLiteGpuDelegateBindBufferToTensor()`
or `TfLiteMetalDelegateBindBufferToTensor()`, respectively.

*IMPORTANT:* These must be called before
`Interpreter::ModifyGraphWithDelegate()`.

*IMPORTANT:* By default, the inference output is copied from GPU memory to CPU
memory implicitly by the framework. This behavior can be turned off by calling
`Interpreter::SetAllowBufferHandleOutput(true)` during initialization. In that
case, copying the inference output from GPU memory to CPU memory requires an
explicit `Interpreter::EnsureTensorDataIsReadable()` call for each output
tensor.
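
For illustration, the `user_provided_input_buffer` used below could be a plain
GL shader storage buffer. The following is a hedged sketch (standard OpenGL ES
calls, not part of the original document); `input_size_bytes` is an assumed
placeholder for the tensor's byte size, and an EGL context is assumed to be
current on the calling thread:

```c++
GLuint user_provided_input_buffer = 0;
glGenBuffers(1, &user_provided_input_buffer);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, user_provided_input_buffer);
// Allocate storage; the camera texture would later be converted into it.
glBufferData(GL_SHADER_STORAGE_BUFFER, input_size_bytes, nullptr,
             GL_STREAM_COPY);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
```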

```c++
////////
// Prepare GPU delegate.
auto* delegate = TfLiteGpuDelegateCreate(nullptr);
interpreter->SetAllowBufferHandleOutput(true);  // disable default gpu->cpu copy
#if defined(__ANDROID__)
if (TfLiteGpuDelegateBindBufferToTensor(delegate, user_provided_input_buffer, interpreter->inputs()[0]) != kTfLiteOk) return;
if (TfLiteGpuDelegateBindBufferToTensor(delegate, user_provided_output_buffer, interpreter->outputs()[0]) != kTfLiteOk) return;
#elif defined(__APPLE__)
if (TfLiteMetalDelegateBindBufferToTensor(delegate, user_provided_input_buffer, interpreter->inputs()[0]) != kTfLiteOk) return;
if (TfLiteMetalDelegateBindBufferToTensor(delegate, user_provided_output_buffer, interpreter->outputs()[0]) != kTfLiteOk) return;
#endif
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return;

////////
// Run inference.
if (interpreter->Invoke() != kTfLiteOk) return;
```
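
When `SetAllowBufferHandleOutput(true)` is used but a particular output is still
needed on the CPU, it has to be pulled back explicitly, as described above. A
minimal sketch, assuming the first output should be read as floats:

```c++
const int output_index = interpreter->outputs()[0];
if (interpreter->EnsureTensorDataIsReadable(output_index) != kTfLiteOk) return;
const float* output_data = interpreter->typed_output_tensor<float>(0);
```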

## Tips and Tricks

* Some operations that are trivial on the CPU may be costly on the GPU. One
  class of such operations is the various forms of reshape (including
  `BATCH_TO_SPACE`, `SPACE_TO_BATCH`, `SPACE_TO_DEPTH`, etc.). If those ops are
  in the network only to help the network architect reason about the model, it
  is worth removing them for performance.

* On the GPU, tensor data is sliced into 4-channel planes. Thus, a computation
  on a tensor of shape `[B, H, W, 5]` will perform about the same as on a
  tensor of shape `[B, H, W, 8]`, but significantly worse than on
  `[B, H, W, 4]`.

* In that sense, if the camera hardware supports image frames in RGBA, feeding
  that 4-channel input is significantly faster, as a memory copy (from
  3-channel RGB to 4-channel RGBX) can be avoided.

* For performance
  [best practices](https://www.tensorflow.org/lite/performance/best_practices),
  do not hesitate to re-train your classifier with a mobile-optimized network
  architecture. That is a significant part of optimization for on-device
  inference.
154
tensorflow/lite/delegates/gpu/common/BUILD
Normal file
@@ -0,0 +1,154 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])  # Apache 2.0

cc_library(
    name = "convert",
    srcs = ["convert.cc"],
    hdrs = ["convert.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "@FP16",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:span",
    ],
)

cc_library(
    name = "data_type",
    srcs = ["data_type.cc"],
    hdrs = ["data_type.h"],
)

cc_library(
    name = "model",
    hdrs = ["model.h"],
    deps = [
        ":data_type",
        ":shape",
        ":status",
        ":tensor",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:any",
    ],
)

cc_test(
    name = "model_test",
    srcs = ["model_test.cc"],
    deps = [
        ":model",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "model_builder",
    srcs = ["model_builder.cc"],
    hdrs = ["model_builder.h"],
    deps = [
        ":data_type",
        ":model",
        ":operations",
        ":shape",
        ":status",
        ":tensor",
        "//tensorflow/lite:context",
        "//tensorflow/lite:kernel_api",
        "//tensorflow/lite/kernels:kernel_util",
        "//tensorflow/lite/schema:schema_fbs",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
    ],
)

# TODO(impjdi): Add unit test for model_builder.

cc_library(
    name = "model_transformer",
    srcs = ["model_transformer.cc"],
    hdrs = ["model_transformer.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:model",
        "@com_google_absl//absl/strings",
    ],
)

# TODO(impjdi): Add unit test for model_transformer.

cc_library(
    name = "operations",
    srcs = ["operations.cc"],
    hdrs = ["operations.h"],
    deps = [
        ":data_type",
        ":model",
        ":shape",
        ":status",
        "@com_google_absl//absl/types:variant",
    ],
)

# TODO(impjdi): Add unit test for operations.

cc_library(
    name = "shape",
    srcs = ["shape.cc"],
    hdrs = ["shape.h"],
    deps = [
        "@com_google_absl//absl/strings",
    ],
)

cc_test(
    name = "shape_test",
    srcs = ["shape_test.cc"],
    deps = [
        ":shape",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "status",
    hdrs = ["status.h"],
)

cc_library(
    name = "tensor",
    hdrs = ["tensor.h"],
    deps = [
        ":data_type",
        ":shape",
    ],
)

cc_library(
    name = "types",
    hdrs = ["types.h"],
    deps = [
        "@FP16",
    ],
)

cc_library(
    name = "util",
    hdrs = ["util.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:types",
    ],
)

cc_test(
    name = "util_test",
    srcs = ["util_test.cc"],
    deps = [
        ":util",
        "@com_google_googletest//:gtest_main",
    ],
)
506
tensorflow/lite/delegates/gpu/common/convert.cc
Normal file
@@ -0,0 +1,506 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/convert.h"

#include <fp16.h>
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace {

constexpr int kPhwc4ChannelsInPlane = 4;
constexpr int kPhwo4i4ChannelsInPlane = 4;
constexpr int kPiohw4ChannelsInPlane = 4;

}  // namespace

uint32_t GetElementsSizeForPHWO4I4(const OHWI& shape) {
  return AlignByN(shape.i, kPhwo4i4ChannelsInPlane) *
         AlignByN(shape.o, kPhwo4i4ChannelsInPlane) * shape.h * shape.w;
}

uint32_t GetElementsSizeForPHWO4I4(const IHWO& shape) {
  return AlignByN(shape.i, kPhwo4i4ChannelsInPlane) *
         AlignByN(shape.o, kPhwo4i4ChannelsInPlane) * shape.h * shape.w;
}

// Layout is Po,H,W,OI4x4.
Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
                        absl::Span<float> out) {
  if (in.size() != shape.DimensionsProduct()) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWO4I4: Input data size does not match expected size: ",
        in.size(), " != ", shape.DimensionsProduct()));
  }
  if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWO4I4: Output data size does not match expected size: ",
        out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
  }

  float* output = out.data();
  for (int p = 0; p < IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane);
       ++p) {
    for (int h = 0; h < shape.h; ++h) {
      for (int w = 0; w < shape.w; ++w) {
        for (int c = 0;
             c < IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) {
          for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) {
            for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) {
              float value = 0;
              if (c * kPhwo4i4ChannelsInPlane + ci < shape.i &&
                  p * kPhwo4i4ChannelsInPlane + co < shape.o) {
                // tensor is in OHWI
                int tensor_o = p * kPhwo4i4ChannelsInPlane + co;
                int tensor_i = c * kPhwo4i4ChannelsInPlane + ci;
                value = in[shape.LinearIndex({tensor_o, h, w, tensor_i})];
              }
              (*output++) = value;
            }
          }
        }
      }
    }
  }
  return OkStatus();
}

std::vector<float> ConvertToPHWO4I4(
    const Tensor<OHWI, DataType::FLOAT32>& tensor) {
  std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
  ConvertToPHWO4I4(tensor.data, tensor.shape,
                   absl::MakeSpan(transposed.data(), transposed.size()))
      .IgnoreError();
  return transposed;
}

uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
  return uint3(AlignByN(shape.i, 4), shape.h * shape.w,
               IntegralDivideRoundUp(shape.o, 4));
}

// Layout is Po,H,W,OI4x4.
Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
                        absl::Span<float> out) {
  if (in.size() != shape.DimensionsProduct()) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWO4I4: Input data size does not match expected size: ",
        in.size(), " != ", shape.DimensionsProduct()));
  }
  if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWO4I4: Output data size does not match expected size: ",
        out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
  }

  const int dst_depth = IntegralDivideRoundUp(shape.o, 4);
  const int src_depth = IntegralDivideRoundUp(shape.i, 4);

  float* output = out.data();
  for (int f = 0; f < dst_depth; ++f) {
    for (int y = 0; y < shape.h; ++y) {
      for (int x = 0; x < shape.w; ++x) {
        for (int ch = 0; ch < src_depth; ++ch) {
          for (int co = 0; co < 4; ++co) {
            for (int ci = 0; ci < 4; ++ci) {
              const int src_channel = ch * 4 + ci;
              const int dst_channel = f * 4 + co;
              float value = 0;
              if (src_channel < shape.i && dst_channel < shape.o) {
                // tensor is in IHWO
                value = in[shape.LinearIndex({src_channel, y, x, dst_channel})];
              }
              (*output++) = value;
            }
          }
        }
      }
    }
  }
  return OkStatus();
}

std::vector<float> ConvertToPHWO4I4(
    const Tensor<IHWO, DataType::FLOAT32>& tensor) {
  std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
  ConvertToPHWO4I4(tensor.data, tensor.shape,
                   absl::MakeSpan(transposed.data(), transposed.size()))
      .IgnoreError();
  return transposed;
}

uint32_t GetElementsSizeForPIOHW4(const OHWI& shape) {
  return AlignByN(shape.o * shape.i, kPiohw4ChannelsInPlane) * shape.h *
         shape.w;
}

Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
                       absl::Span<float> out) {
  if (in.size() != shape.DimensionsProduct()) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPIOHW4: Input data size does not match expected size: ",
        in.size(), " != ", shape.DimensionsProduct()));
  }
  if (out.size() != GetElementsSizeForPIOHW4(shape)) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPIOHW4: Output data size does not match expected size: ",
        out.size(), " != ", GetElementsSizeForPIOHW4(shape)));
  }

  int32_t output_channels = shape.o * shape.i;
  int32_t num_planes =
      IntegralDivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
  float* output = out.data();
  for (int p = 0; p < num_planes; ++p) {
    for (int h = 0; h < shape.h; ++h) {
      for (int w = 0; w < shape.w; ++w) {
        for (int c = 0; c < kPiohw4ChannelsInPlane; ++c) {
          int output_c = p * kPiohw4ChannelsInPlane + c;
          (*output++) = output_c >= output_channels
                            ? 0
                            : in[shape.LinearIndex({output_c % shape.o, h, w,
                                                    output_c / shape.o})];
        }
      }
    }
  }
  return OkStatus();
}

std::vector<float> ConvertToPIOHW4(
    const Tensor<OHWI, DataType::FLOAT32>& tensor) {
  std::vector<float> transposed(GetElementsSizeForPIOHW4(tensor.shape));
  ConvertToPIOHW4(tensor.data, tensor.shape,
                  absl::MakeSpan(transposed.data(), transposed.size()))
      .IgnoreError();
  return transposed;
}

template <typename T>
Status ValidateConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
                              absl::Span<T> out) {
  if (in.size() != shape.DimensionsProduct()) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWC4: Input data size does not match expected size: ",
        in.size(), " != ", shape.DimensionsProduct()));
  }
  if (out.size() != GetElementsSizeForPHWC4(shape)) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertToPHWC4: Output data size does not match expected size: ",
        out.size(), " != ", GetElementsSizeForPHWC4(shape)));
  }
  return OkStatus();
}

// Layout is Pc,H,W,C4 where P - is a plane based on channels.
Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
                      absl::Span<float> out) {
  RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
  if (shape.c == 4) {
    std::memcpy(out.data(), in.data(),
                shape.DimensionsProduct() * sizeof(float));
    return OkStatus();
  }
  // Layout is Pc,H,W,C4 where P - is a plane based on channels.
  int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
  const int num_pixels = shape.h * shape.w;
  // A layer is a set of kPhwc4ChannelsInPlane channels images.
  const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
  for (int b = 0; b < shape.b; b++) {
    float* dest =
        out.data() + b * num_pixels * num_planes * kPhwc4ChannelsInPlane;
    for (int p = 0; p < num_full_planes; p++) {
      const float* src =
          in.data() + shape.LinearIndex({b, 0, 0, p * kPhwc4ChannelsInPlane});
      for (int i = 0; i < num_pixels; i++) {
        std::memcpy(dest, src, kPhwc4ChannelsInPlane * sizeof(float));
        src += shape.c;
        dest += kPhwc4ChannelsInPlane;
      }
    }
  }

  // Padding last kPhwc4ChannelsInPlane-channel layer to multiple of
  // kPhwc4ChannelsInPlane.
  const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
  const int remaining_channels =
      shape.c - num_full_planes * kPhwc4ChannelsInPlane;
  if (remaining_channels == 0) {
    return OkStatus();
  }
  for (int b = 0; b < shape.b; b++) {
    const float* src =
        in.data() +
        shape.LinearIndex({b, 0, 0, num_full_planes * kPhwc4ChannelsInPlane});
    float* dest = out.data() + b * padded_size +
                  num_pixels * num_full_planes * kPhwc4ChannelsInPlane;
    for (int p = 0; p < num_pixels; p++) {
      std::memcpy(dest, src, remaining_channels * sizeof(float));
      std::memset(dest + remaining_channels, 0,
                  (4 - remaining_channels) * sizeof(float));
      src += shape.c;
      dest += kPhwc4ChannelsInPlane;
    }
  }
  return OkStatus();
}

// Layout is Pc,H,W,C4 where P - is a plane based on channels.
Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
                          absl::Span<HalfBits> out) {
  RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));

  // Layout is Pc,H,W,C4 where P - is a plane based on channels.
  int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
  const int num_pixels = shape.h * shape.w;
  // A layer is a set of kPhwc4ChannelsInPlane channels images.
  const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
  for (int b = 0; b < shape.b; b++) {
    HalfBits* dest =
        out.data() + b * num_pixels * num_planes * kPhwc4ChannelsInPlane;
    for (int p = 0; p < num_full_planes; p++) {
      const float* src =
          in.data() + shape.LinearIndex({b, 0, 0, p * kPhwc4ChannelsInPlane});
      for (int i = 0; i < num_pixels; i++) {
        dest[0] = fp16_ieee_from_fp32_value(src[0]);
        dest[1] = fp16_ieee_from_fp32_value(src[1]);
        dest[2] = fp16_ieee_from_fp32_value(src[2]);
        dest[3] = fp16_ieee_from_fp32_value(src[3]);
        src += shape.c;
        dest += kPhwc4ChannelsInPlane;
      }
    }
  }

  // Padding last kPhwc4ChannelsInPlane-channel layer to multiple of
  // kPhwc4ChannelsInPlane.
  const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
  const int remaining_channels =
      shape.c - num_full_planes * kPhwc4ChannelsInPlane;
  if (remaining_channels == 0) {
    return OkStatus();
  }

  for (int b = 0; b < shape.b; b++) {
    const float* src =
        in.data() +
        shape.LinearIndex({b, 0, 0, num_full_planes * kPhwc4ChannelsInPlane});
    HalfBits* dest = out.data() + b * padded_size +
                     num_pixels * num_full_planes * kPhwc4ChannelsInPlane;
    switch (remaining_channels) {
      case 1:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_from_fp32_value(src[0]);
          dest[1] = 0;
          dest[2] = 0;
          dest[3] = 0;
          src += shape.c;
          dest += kPhwc4ChannelsInPlane;
        }
        break;
      case 2:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_from_fp32_value(src[0]);
          dest[1] = fp16_ieee_from_fp32_value(src[1]);
          dest[2] = 0;
          dest[3] = 0;
          src += shape.c;
          dest += kPhwc4ChannelsInPlane;
        }
        break;
      case 3:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_from_fp32_value(src[0]);
          dest[1] = fp16_ieee_from_fp32_value(src[1]);
          dest[2] = fp16_ieee_from_fp32_value(src[2]);
          dest[3] = 0;
          src += shape.c;
          dest += kPhwc4ChannelsInPlane;
        }
        break;
      default:
        return UnimplementedError(
            "ConvertToPHWC4Half: Unsupported channels per planes count.");
    }
  }
  return OkStatus();
}

std::vector<float> ConvertToPHWC4(
    const Tensor<BHWC, DataType::FLOAT32>& tensor) {
  std::vector<float> transposed(GetElementsSizeForPHWC4(tensor.shape));
  ConvertToPHWC4(tensor.data, tensor.shape,
                 absl::MakeSpan(transposed.data(), transposed.size()))
      .IgnoreError();
  // TODO(akulik): Maybe safer to return Status.
  return transposed;
}

std::vector<float> ConvertToPHWC4(
    const Tensor<HWC, DataType::FLOAT32>& tensor) {
  const BHWC batched_shape =
      BHWC(1, tensor.shape.h, tensor.shape.w, tensor.shape.c);
  std::vector<float> transposed(GetElementsSizeForPHWC4(batched_shape));
  ConvertToPHWC4(tensor.data, batched_shape,
                 absl::MakeSpan(transposed.data(), transposed.size()))
      .IgnoreError();
  // TODO(akulik): Maybe safer to return Status.
  return transposed;
}

uint32_t GetElementsSizeForPHWC4(const BHWC& shape) {
  return shape.b * shape.h * shape.w * AlignByN(shape.c, kPhwc4ChannelsInPlane);
}

template <typename T>
Status ValidateConvertFromPHWC4(absl::Span<const T> in, const BHWC& shape,
                                absl::Span<float> out) {
  if (in.size() != GetElementsSizeForPHWC4(shape)) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertFromPHWC4: Input data size does not match expected size: ",
        in.size(), " != ", GetElementsSizeForPHWC4(shape)));
  }
  if (out.size() != shape.DimensionsProduct()) {
    return InvalidArgumentError(absl::StrCat(
        "ConvertFromPHWC4: Output data size does not match expected size: ",
        out.size(), " != ", shape.DimensionsProduct()));
  }
  return OkStatus();
}

Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
                        absl::Span<float> out) {
  RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
  if (shape.c == 4) {
    std::memcpy(out.data(), in.data(),
                shape.DimensionsProduct() * sizeof(float));
    return OkStatus();
  }

  int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
  const int num_pixels = shape.h * shape.w;
  const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
  // A layer is a set of kPhwc4ChannelsInPlane channels images.
  const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
  for (int b = 0; b < shape.b; b++) {
    const float* src = in.data() + b * padded_size;
    for (int p = 0; p < num_full_planes; p++) {
      float* dest =
          out.data() + shape.LinearIndex({b, 0, 0, p * kPhwc4ChannelsInPlane});
      for (int i = 0; i < num_pixels; i++) {
        std::memcpy(dest, src, kPhwc4ChannelsInPlane * sizeof(float));
        src += kPhwc4ChannelsInPlane;
        dest += shape.c;
      }
    }
  }

  // Unpadding last kPhwc4ChannelsInPlane-channel plane
  const int remaining_channels =
      shape.c - num_full_planes * kPhwc4ChannelsInPlane;
  if (remaining_channels == 0) {
    return OkStatus();
  }
  for (int b = 0; b < shape.b; b++) {
    const float* src = in.data() + b * padded_size +
                       num_pixels * num_full_planes * kPhwc4ChannelsInPlane;
    float* dest =
        out.data() +
        shape.LinearIndex({b, 0, 0, num_full_planes * kPhwc4ChannelsInPlane});
    for (int p = 0; p < num_pixels; p++) {
      std::memcpy(dest, src, remaining_channels * sizeof(float));
      src += kPhwc4ChannelsInPlane;
      dest += shape.c;
    }
  }
  return OkStatus();
}

Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in, const BHWC& shape,
                            absl::Span<float> out) {
  RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
  int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
  const int num_pixels = shape.h * shape.w;
  const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
  // A layer is a set of kPhwc4ChannelsInPlane channels images.
  const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
  for (int b = 0; b < shape.b; b++) {
    const HalfBits* src = in.data() + b * padded_size;
    for (int p = 0; p < num_full_planes; p++) {
      float* dest =
          out.data() + shape.LinearIndex({b, 0, 0, p * kPhwc4ChannelsInPlane});
      for (int i = 0; i < num_pixels; i++) {
        dest[0] = fp16_ieee_to_fp32_value(src[0]);
        dest[1] = fp16_ieee_to_fp32_value(src[1]);
        dest[2] = fp16_ieee_to_fp32_value(src[2]);
        dest[3] = fp16_ieee_to_fp32_value(src[3]);
        src += kPhwc4ChannelsInPlane;
        dest += shape.c;
      }
    }
  }

  // Unpadding last kPhwc4ChannelsInPlane-channel plane
  const int remaining_channels =
      shape.c - num_full_planes * kPhwc4ChannelsInPlane;
  if (remaining_channels == 0) {
    return OkStatus();
  }
  for (int b = 0; b < shape.b; b++) {
    const HalfBits* src = in.data() + b * padded_size +
                          num_pixels * num_full_planes * kPhwc4ChannelsInPlane;
    float* dest =
        out.data() +
        shape.LinearIndex({b, 0, 0, num_full_planes * kPhwc4ChannelsInPlane});
    switch (remaining_channels) {
      case 1:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_to_fp32_value(src[0]);
          src += kPhwc4ChannelsInPlane;
          dest += shape.c;
        }
        break;
      case 2:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_to_fp32_value(src[0]);
          dest[1] = fp16_ieee_to_fp32_value(src[1]);
          src += kPhwc4ChannelsInPlane;
          dest += shape.c;
        }
        break;
      case 3:
        for (int p = 0; p < num_pixels; p++) {
          dest[0] = fp16_ieee_to_fp32_value(src[0]);
          dest[1] = fp16_ieee_to_fp32_value(src[1]);
          dest[2] = fp16_ieee_to_fp32_value(src[2]);
          src += kPhwc4ChannelsInPlane;
          dest += shape.c;
        }
        break;
      default:
        return UnimplementedError(
            "ConvertToPHWC4Half: Unsupported channels per planes count.");
    }
  }
  return OkStatus();
}

}  // namespace gpu
}  // namespace tflite
97
tensorflow/lite/delegates/gpu/common/convert.h
Normal file
@@ -0,0 +1,97 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CONVERT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CONVERT_H_

#include <vector>

#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {

// PHWC4 layout is where channels are grouped by 4 in a row and P stands for
// a plane that was derived by dividing channels by 4.
::tflite::gpu::Status ConvertToPHWC4(absl::Span<const float> in,
                                     const BHWC& shape, absl::Span<float> out);
::tflite::gpu::Status ConvertToPHWC4Half(
    absl::Span<const float> in, const BHWC& shape,
    absl::Span<::tflite::gpu::HalfBits> out);

// @return number of elements when shape is converted into PHWC4.
uint32_t GetElementsSizeForPHWC4(const BHWC& shape);

// Operation is opposite to ConvertToPHWC4.
::tflite::gpu::Status ConvertFromPHWC4(absl::Span<const float> in,
                                       const BHWC& shape,
                                       absl::Span<float> out);
::tflite::gpu::Status ConvertFromPHWC4Half(
    absl::Span<const ::tflite::gpu::HalfBits> in, const BHWC& shape,
    absl::Span<float> out);

// Convenience wrapper around a method above.
std::vector<float> ConvertToPHWC4(
    const Tensor<BHWC, DataType::FLOAT32>& tensor);
std::vector<float> ConvertToPHWC4(const Tensor<HWC, DataType::FLOAT32>& tensor);

// @return number of elements when shape is converted into PIOHW4.
uint32_t GetElementsSizeForPIOHW4(const OHWI& shape);

// PIOHW4 layout re-arranges weights in groups by 4, where outer dimension is
// P which is OxI/4.
::tflite::gpu::Status ConvertToPIOHW4(absl::Span<const float> in,
                                      const OHWI& shape, absl::Span<float> out);

// Convenience wrapper around a method above.
std::vector<float> ConvertToPIOHW4(
    const Tensor<OHWI, DataType::FLOAT32>& tensor);

// @return number of elements when shape is converted into PHWO4I4.
uint32_t GetElementsSizeForPHWO4I4(const OHWI& shape);

// Layout is Po,H,W,OI4x4.
::tflite::gpu::Status ConvertToPHWO4I4(absl::Span<const float> in,
                                       const OHWI& shape,
                                       absl::Span<float> out);

// Convenience wrapper around a method above.
std::vector<float> ConvertToPHWO4I4(
    const Tensor<OHWI, DataType::FLOAT32>& tensor);

// @return (x,y,z) size for PHWO4I4 to access elements where each element
// consists of 4 values.
::tflite::gpu::uint3 Get3DSizeForPHWO4I4(const OHWI& shape);

// @return number of elements when shape is converted into PHWO4I4.
uint32_t GetElementsSizeForPHWO4I4(const IHWO& shape);

// Layout is Po,H,W,OI4x4.
::tflite::gpu::Status ConvertToPHWO4I4(absl::Span<const float> in,
                                       const IHWO& shape,
                                       absl::Span<float> out);

// Convenience wrapper around a method above.
std::vector<float> ConvertToPHWO4I4(
    const Tensor<IHWO, DataType::FLOAT32>& tensor);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CONVERT_H_
78
tensorflow/lite/delegates/gpu/common/data_type.cc
Normal file
@@ -0,0 +1,78 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/data_type.h"

#include <stddef.h>
#include <string>

namespace tflite {
namespace gpu {

size_t SizeOf(DataType data_type) {
  switch (data_type) {
    case DataType::UINT8:
    case DataType::INT8:
      return 1;
    case DataType::FLOAT16:
    case DataType::INT16:
    case DataType::UINT16:
      return 2;
    case DataType::FLOAT32:
    case DataType::INT32:
    case DataType::UINT32:
      return 4;
    case DataType::FLOAT64:
    case DataType::INT64:
    case DataType::UINT64:
      return 8;
    case DataType::UNKNOWN:
      return 0;
  }
  return 0;
}

std::string ToString(DataType data_type) {
  switch (data_type) {
    case DataType::FLOAT16:
      return "float16";
    case DataType::FLOAT32:
      return "float32";
    case DataType::FLOAT64:
      return "float64";
    case DataType::INT16:
      return "int16";
    case DataType::INT32:
      return "int32";
    case DataType::INT64:
      return "int64";
    case DataType::INT8:
      return "int8";
    case DataType::UINT16:
      return "uint16";
    case DataType::UINT32:
      return "uint32";
    case DataType::UINT64:
      return "uint64";
    case DataType::UINT8:
      return "uint8";
    case DataType::UNKNOWN:
      return "unknown";
  }
  return "undefined";
}

}  // namespace gpu
}  // namespace tflite
47
tensorflow/lite/delegates/gpu/common/data_type.h
Normal file
@@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_DATA_TYPE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_DATA_TYPE_H_

#include <stddef.h>
#include <string>

namespace tflite {
namespace gpu {

enum class DataType {
  UNKNOWN = 0,
  FLOAT16 = 1,
  FLOAT32 = 2,
  FLOAT64 = 3,
  UINT8 = 4,
  INT8 = 5,
  UINT16 = 6,
  INT16 = 7,
  UINT32 = 8,
  INT32 = 9,
  UINT64 = 10,
  INT64 = 11,
};

size_t SizeOf(DataType type);

std::string ToString(DataType t);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_DATA_TYPE_H_
547
tensorflow/lite/delegates/gpu/common/model.h
Normal file
@@ -0,0 +1,547 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_H_
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "absl/memory/memory.h"
|
||||||
|
#include "absl/types/any.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
// There is yet another representation of CNN graph. The primary purpose of this
|
||||||
|
// representation is to simplify graph manipulation.
|
||||||
|
|
||||||
|
using ValueId = uint32_t;
|
||||||
|
|
||||||
|
using NodeId = uint32_t;
|
||||||
|
|
||||||
|
// Connects tensor's producer and operation that depends on this tensor.
|
||||||
|
template <typename TensorT>
|
||||||
|
struct Value {
|
||||||
|
using TensorType = TensorT;
|
||||||
|
|
||||||
|
const ValueId id;
|
||||||
|
|
||||||
|
TensorType tensor;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Operation {
|
||||||
|
std::string type;
|
||||||
|
|
||||||
|
absl::any attributes;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Node {
|
||||||
|
const NodeId id;
|
||||||
|
|
||||||
|
Operation operation;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Graph is DAG that consists of nodes and values. Each value may have a single
|
||||||
|
// producer node and multiple consumer nodes. Therefore, each node may have
|
||||||
|
// multiple input and output values.
|
||||||
|
//
|
||||||
|
// Value that does not have a producer is a graph's input. Value that does not
|
||||||
|
// have a consumer is a graph's output.
|
||||||
|
//
|
||||||
|
// Interface provides methods for graph introspection and manipulation. Abstract
|
||||||
|
// interface makes allows subgraphs representation to ensure safe manipulations.
|
||||||
|
template <typename TensorT>
|
||||||
|
class Graph {
|
||||||
|
public:
|
||||||
|
virtual ~Graph() = default;
|
||||||
|
|
||||||
|
// @return a collection of nodes in this graph.
|
||||||
|
virtual std::vector<Node*> nodes() const = 0;
|
||||||
|
|
||||||
|
// @return a collection of values in this graph.
|
||||||
|
virtual std::vector<Value<TensorT>*> values() const = 0;
|
||||||
|
|
||||||
|
// @return graph inputs, that are values without producers.
|
||||||
|
virtual std::vector<Value<TensorT>*> inputs() const = 0;
|
||||||
|
|
||||||
|
// @return graph outputs, that are values without consumers.
|
||||||
|
virtual std::vector<Value<TensorT>*> outputs() const = 0;
|
||||||
|
|
||||||
|
// @return inputs into the given node. Returns empty vector for deleted node.
|
||||||
|
virtual std::vector<Value<TensorT>*> FindInputs(NodeId id) const = 0;
|
||||||
|
|
||||||
|
// @return outputs from the given node. Returns empty vector for deleted node.
|
||||||
|
virtual std::vector<Value<TensorT>*> FindOutputs(NodeId id) const = 0;
|
||||||
|
|
||||||
|
virtual bool IsGraphInput(ValueId id) const = 0;
|
||||||
|
|
||||||
|
virtual bool IsGraphOutput(ValueId id) const = 0;
|
||||||
|
|
||||||
|
// @return producer of the given value. Returns nullptr for deleted value.
|
||||||
|
virtual Node* FindProducer(ValueId id) const = 0;
|
||||||
|
|
||||||
|
// @return consumers of the given value. Returns empty vector for deleted
|
||||||
|
// value.
|
||||||
|
virtual std::vector<Node*> FindConsumers(ValueId id) const = 0;
|
||||||
|
|
||||||
|
// @return a node or nullptr if node with the given id is not present.
|
||||||
|
virtual Node* GetNode(NodeId id) const = 0;
|
||||||
|
|
||||||
|
// @return a value or nullptr if value with the given id is not present.
|
||||||
|
virtual Value<TensorT>* GetValue(ValueId id) const = 0;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Graph manipulation functions are below
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// @return new node created in this graph
|
||||||
|
// NOTE: nodes should be created in the topological order, e.g. node A that
|
||||||
|
// depends on a value from node B should be created after node B.
|
||||||
|
virtual Node* NewNode() = 0;
|
||||||
|
|
||||||
|
// @return new value created in this graph
|
||||||
|
virtual Value<TensorT>* NewValue() = 0;
|
||||||
|
|
||||||
|
// Sets a producer for the given value. A value may have only a single
// producer. If the value already had another producer, the producer is
// reassigned appropriately. If the value did not have a producer, it is
// removed from the graph's inputs.
|
||||||
|
virtual Status SetProducer(NodeId producer, ValueId value) = 0;
|
||||||
|
|
||||||
|
// Removes a producer for the given value. Value becomes producer-less and
|
||||||
|
// therefore becomes graph's input.
|
||||||
|
virtual Status RemoveProducer(ValueId value) = 0;
|
||||||
|
|
||||||
|
// Sets a consumer for the given value. There could be multiple consumers
|
||||||
|
// for a value.
|
||||||
|
virtual Status AddConsumer(NodeId consumer, ValueId value) = 0;
|
||||||
|
|
||||||
|
// Removes a consumer for the given value. If value does not have any
|
||||||
|
// consumers it becomes graph's output.
|
||||||
|
virtual Status RemoveConsumer(NodeId consumer, ValueId value) = 0;
|
||||||
|
|
||||||
|
// Removes node from this graph. For all input values this node will be
|
||||||
|
// removed from consumers and for all output values a producer will be
|
||||||
|
// removed.
|
||||||
|
virtual Status DeleteNode(NodeId id) = 0;
|
||||||
|
|
||||||
|
// Removes value from this graph. It will be removed from inputs for all
// dependent nodes. A node that was a producer of this value will lose its
// output.
|
||||||
|
virtual Status DeleteValue(ValueId id) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Implementation of the Graph interface. It keeps values and nodes referenced
// by their index in a vector. Therefore, nodes and values are never deleted,
// but rather erased in place, so their indices remain stable.
|
||||||
|
//
|
||||||
|
// It is possible to re-use removed indices, but it is not implemented yet.
|
||||||
|
template <typename TensorT>
|
||||||
|
class Model : public Graph<TensorT> {
|
||||||
|
public:
|
||||||
|
const std::string& name() const { return name_; }
|
||||||
|
|
||||||
|
void set_name(std::string name) { name_ = std::move(name); }
|
||||||
|
|
||||||
|
std::vector<Value<TensorT>*> values() const final {
|
||||||
|
return FilterValues([](const ValueDef&) { return true; });
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Node*> nodes() const final {
|
||||||
|
return FilterNodes([](const NodeDef&) { return true; });
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Value<TensorT>*> inputs() const final {
|
||||||
|
return FilterValues(
|
||||||
|
[](const ValueDef& v) { return v.producer == nullptr; });
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Value<TensorT>*> outputs() const final {
|
||||||
|
return FilterValues([](const ValueDef& v) { return v.consumers.empty(); });
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsGraphInput(ValueId id) const final {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return values_[id].producer == nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsGraphOutput(ValueId id) const final {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return values_[id].consumers.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
Node* GetNode(NodeId id) const final {
|
||||||
|
if (id >= nodes_.size()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
return nodes_[id].node.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
Value<TensorT>* GetValue(ValueId id) const final {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return values_[id].value.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
Node* NewNode() final {
|
||||||
|
NodeDef def;
|
||||||
|
def.node =
|
||||||
|
absl::make_unique<Node>(Node{static_cast<NodeId>(nodes_.size()), {}});
|
||||||
|
Node* node = def.node.get();
|
||||||
|
nodes_.push_back(std::move(def));
|
||||||
|
return node;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value<TensorT>* NewValue() final {
|
||||||
|
ValueDef def;
|
||||||
|
def.value = absl::make_unique<Value<TensorT>>(
|
||||||
|
Value<TensorT>{static_cast<ValueId>(values_.size()), {}});
|
||||||
|
Value<TensorT>* value = def.value.get();
|
||||||
|
values_.push_back(std::move(def));
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Value<TensorT>*> FindInputs(NodeId id) const final {
|
||||||
|
if (id >= nodes_.size()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
return nodes_[id].inputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Value<TensorT>*> FindOutputs(NodeId id) const final {
|
||||||
|
if (id >= nodes_.size()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
return nodes_[id].outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
Node* FindProducer(ValueId id) const final {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return values_[id].producer;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Node*> FindConsumers(ValueId id) const final {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
return values_[id].consumers;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status SetProducer(NodeId producer, ValueId value) final {
|
||||||
|
ValueDef* v;
|
||||||
|
RETURN_IF_ERROR(LookupValue(value, &v));
|
||||||
|
Value<TensorT>* value_ptr = v->value.get();
|
||||||
|
NodeDef* n;
|
||||||
|
RETURN_IF_ERROR(LookupNode(producer, &n));
|
||||||
|
Node* node_ptr = n->node.get();
|
||||||
|
|
||||||
|
// Check if this node is already the producer of the value.
|
||||||
|
if (node_ptr == v->producer) {
|
||||||
|
return InvalidArgumentError("Node is already a producer of the value");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the node is a consumer of this value.
|
||||||
|
if (std::find(n->inputs.begin(), n->inputs.end(), value_ptr) !=
|
||||||
|
n->inputs.end()) {
|
||||||
|
return InvalidArgumentError("Node is a consumer of the value");
|
||||||
|
}
|
||||||
|
// TODO(akulik): detect circular dependency?
|
||||||
|
|
||||||
|
if (v->producer != nullptr) {
|
||||||
|
// The value is no longer produced by its previous producer.
|
||||||
|
Erase(&nodes_[v->producer->id].outputs, value_ptr);
|
||||||
|
}
|
||||||
|
v->producer = node_ptr;
|
||||||
|
n->outputs.push_back(value_ptr);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status RemoveProducer(ValueId value) final {
|
||||||
|
ValueDef* v;
|
||||||
|
RETURN_IF_ERROR(LookupValue(value, &v));
|
||||||
|
Value<TensorT>* value_ptr = v->value.get();
|
||||||
|
if (v->producer == nullptr) {
|
||||||
|
return InvalidArgumentError("Value does not have a producer");
|
||||||
|
}
|
||||||
|
Erase(&nodes_[v->producer->id].outputs, value_ptr);
|
||||||
|
v->producer = nullptr;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status AddConsumer(NodeId consumer, ValueId value) final {
|
||||||
|
ValueDef* v;
|
||||||
|
RETURN_IF_ERROR(LookupValue(value, &v));
|
||||||
|
Value<TensorT>* value_ptr = v->value.get();
|
||||||
|
NodeDef* n;
|
||||||
|
RETURN_IF_ERROR(LookupNode(consumer, &n));
|
||||||
|
Node* node_ptr = n->node.get();
|
||||||
|
|
||||||
|
// Check if the node is a producer of this value.
|
||||||
|
if (node_ptr == v->producer) {
|
||||||
|
return InvalidArgumentError("Node is a producer of the value");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the node is already a consumer of this value.
|
||||||
|
if (std::find(n->inputs.begin(), n->inputs.end(), value_ptr) !=
|
||||||
|
n->inputs.end()) {
|
||||||
|
return InvalidArgumentError("Node is already a consumer of the value");
|
||||||
|
}
|
||||||
|
|
||||||
|
n->inputs.push_back(value_ptr);
|
||||||
|
v->consumers.push_back(node_ptr);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status RemoveConsumer(NodeId consumer, ValueId value) final {
|
||||||
|
ValueDef* v;
|
||||||
|
RETURN_IF_ERROR(LookupValue(value, &v));
|
||||||
|
Value<TensorT>* value_ptr = v->value.get();
|
||||||
|
NodeDef* n;
|
||||||
|
RETURN_IF_ERROR(LookupNode(consumer, &n));
|
||||||
|
Node* node_ptr = n->node.get();
|
||||||
|
if (std::find(n->inputs.begin(), n->inputs.end(), value_ptr) ==
|
||||||
|
n->inputs.end()) {
|
||||||
|
return InvalidArgumentError("Node is not a consumer of the value");
|
||||||
|
}
|
||||||
|
Erase(&n->inputs, value_ptr);
|
||||||
|
Erase(&v->consumers, node_ptr);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DeleteNode(NodeId id) final {
|
||||||
|
NodeDef* n;
|
||||||
|
RETURN_IF_ERROR(LookupNode(id, &n));
|
||||||
|
Node* node_ptr = n->node.get();
|
||||||
|
for (auto value : n->inputs) {
|
||||||
|
Erase(&values_[value->id].consumers, node_ptr);
|
||||||
|
}
|
||||||
|
for (auto value : n->outputs) {
|
||||||
|
values_[value->id].producer = nullptr;
|
||||||
|
}
|
||||||
|
n->inputs.clear();
|
||||||
|
n->outputs.clear();
|
||||||
|
n->node.reset();
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status DeleteValue(ValueId id) final {
|
||||||
|
ValueDef* v;
|
||||||
|
RETURN_IF_ERROR(LookupValue(id, &v));
|
||||||
|
Value<TensorT>* value_ptr = v->value.get();
|
||||||
|
if (v->producer != nullptr) {
|
||||||
|
Erase(&nodes_[v->producer->id].outputs, value_ptr);
|
||||||
|
}
|
||||||
|
if (!v->consumers.empty()) {
|
||||||
|
for (auto node : v->consumers) {
|
||||||
|
Erase(&nodes_[node->id].inputs, value_ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
v->producer = nullptr;
|
||||||
|
v->consumers.clear();
|
||||||
|
v->value.reset();
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status MakeExactCopy(Model<TensorT>* model) const {
|
||||||
|
model->nodes_.clear();
|
||||||
|
model->values_.clear();
|
||||||
|
model->name_ = name_;
|
||||||
|
for (auto& value_def : values_) {
|
||||||
|
model->values_.push_back({});
|
||||||
|
if (value_def.value) {
|
||||||
|
model->values_.back().value =
|
||||||
|
absl::make_unique<Value<TensorT>>(*value_def.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto& node_def : nodes_) {
|
||||||
|
model->nodes_.push_back({});
|
||||||
|
if (node_def.node) {
|
||||||
|
model->nodes_.back().node = absl::make_unique<Node>(*node_def.node);
|
||||||
|
for (auto output : node_def.outputs) {
|
||||||
|
RETURN_IF_ERROR(model->SetProducer(node_def.node->id, output->id));
|
||||||
|
}
|
||||||
|
for (auto input : node_def.inputs) {
|
||||||
|
RETURN_IF_ERROR(model->AddConsumer(node_def.node->id, input->id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct NodeDef {
|
||||||
|
std::vector<Value<TensorT>*> inputs;
|
||||||
|
std::vector<Value<TensorT>*> outputs;
|
||||||
|
std::unique_ptr<Node> node;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ValueDef {
|
||||||
|
Node* producer = nullptr;
|
||||||
|
std::vector<Node*> consumers;
|
||||||
|
std::unique_ptr<Value<TensorT>> value;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static void Erase(std::vector<T>* values, T value) {
|
||||||
|
values->erase(std::find(values->begin(), values->end(), value));
|
||||||
|
}
|
||||||
|
|
||||||
|
// @return non-nullptr NodeDef that has valid Node or an error
|
||||||
|
Status LookupNode(NodeId id, NodeDef** node_def) {
|
||||||
|
if (id >= nodes_.size()) {
|
||||||
|
return OutOfRangeError("NodeId is out of range");
|
||||||
|
}
|
||||||
|
auto& n = nodes_[id];
|
||||||
|
if (!n.node) {
|
||||||
|
return OutOfRangeError("Node is already deleted");
|
||||||
|
}
|
||||||
|
*node_def = &n;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
// @return non-nullptr ValueDef that has valid Value or an error
|
||||||
|
Status LookupValue(ValueId id, ValueDef** value_def) {
|
||||||
|
if (id >= values_.size()) {
|
||||||
|
return OutOfRangeError("ValueId is out of range");
|
||||||
|
}
|
||||||
|
auto& v = values_[id];
|
||||||
|
if (!v.value) {
|
||||||
|
return OutOfRangeError("Value is already deleted");
|
||||||
|
}
|
||||||
|
*value_def = &v;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Pred>
|
||||||
|
std::vector<Value<TensorT>*> FilterValues(const Pred& predicate) const {
|
||||||
|
std::vector<Value<TensorT>*> values;
|
||||||
|
values.reserve(values_.size());
|
||||||
|
for (auto& v : values_) {
|
||||||
|
if (v.value != nullptr && predicate(v)) {
|
||||||
|
values.push_back(v.value.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Pred>
|
||||||
|
std::vector<Node*> FilterNodes(const Pred& predicate) const {
|
||||||
|
std::vector<Node*> nodes;
|
||||||
|
nodes.reserve(nodes_.size());
|
||||||
|
for (auto& n : nodes_) {
|
||||||
|
if (n.node != nullptr && predicate(n)) {
|
||||||
|
nodes.push_back(n.node.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name_;
|
||||||
|
|
||||||
|
// There are two approaches possible: wrap entire NodeDef and ValueDef into
|
||||||
|
// unique_ptr and store it in values_ and nodes_ or store it by value.
|
||||||
|
// We store it by value here to make introspection calls cheaper.
|
||||||
|
std::vector<ValueDef> values_;
|
||||||
|
std::vector<NodeDef> nodes_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Removes the to_remove node, which precedes the to_keep node, only if all of
// to_remove's outputs are consumed exclusively by to_keep. In that case,
// to_keep inherits all of to_remove's inputs.
|
||||||
|
template <typename TensorT>
|
||||||
|
Status RemovePrecedingNode(Graph<TensorT>* graph, const Node* to_remove,
|
||||||
|
const Node* to_keep) {
|
||||||
|
// Make sure all outputs from to_remove are consumed by to_keep.
|
||||||
|
for (auto output : graph->FindOutputs(to_remove->id)) {
|
||||||
|
auto consumers = graph->FindConsumers(output->id);
|
||||||
|
if (consumers.size() > 1 ||
|
||||||
|
(consumers.size() == 1 && consumers[0] != to_keep)) {
|
||||||
|
return InvalidArgumentError(
|
||||||
|
"Output from to_remove node has other consumers");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update all references
|
||||||
|
for (auto input : graph->FindInputs(to_remove->id)) {
|
||||||
|
RETURN_IF_ERROR(graph->AddConsumer(to_keep->id, input->id));
|
||||||
|
}
|
||||||
|
for (auto output : graph->FindOutputs(to_remove->id)) {
|
||||||
|
RETURN_IF_ERROR(graph->DeleteValue(output->id));
|
||||||
|
}
|
||||||
|
return graph->DeleteNode(to_remove->id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Removes the to_remove node, which follows the to_keep node, only if all of
// to_remove's inputs are produced by to_keep. In that case, to_keep inherits
// all of to_remove's outputs.
|
||||||
|
template <typename TensorT>
|
||||||
|
Status RemoveFollowingNode(Graph<TensorT>* graph, const Node* to_remove,
|
||||||
|
const Node* to_keep) {
|
||||||
|
// Make sure all inputs to to_remove are produced by to_keep.
|
||||||
|
for (auto input : graph->FindInputs(to_remove->id)) {
|
||||||
|
Node* producer = graph->FindProducer(input->id);
|
||||||
|
if (producer->id != to_keep->id) {
|
||||||
|
return InvalidArgumentError("To_remove node has other inputs");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto input : graph->FindInputs(to_remove->id)) {
|
||||||
|
RETURN_IF_ERROR(graph->DeleteValue(input->id));
|
||||||
|
}
|
||||||
|
for (auto output : graph->FindOutputs(to_remove->id)) {
|
||||||
|
RETURN_IF_ERROR(graph->SetProducer(to_keep->id, output->id));
|
||||||
|
}
|
||||||
|
return graph->DeleteNode(to_remove->id);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename TensorT>
|
||||||
|
Status AddOutput(Graph<TensorT>* graph, const Node* from_node,
|
||||||
|
Value<TensorT>** output) {
|
||||||
|
auto link = graph->NewValue();
|
||||||
|
RETURN_IF_ERROR(graph->SetProducer(from_node->id, link->id));
|
||||||
|
*output = link;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename TensorT>
|
||||||
|
Status ConnectTwoNodes(Graph<TensorT>* graph, const Node* from_node,
|
||||||
|
const Node* to_node, Value<TensorT>** output) {
|
||||||
|
Value<TensorT>* link;
|
||||||
|
RETURN_IF_ERROR(AddOutput(graph, from_node, &link));
|
||||||
|
RETURN_IF_ERROR(graph->AddConsumer(to_node->id, link->id));
|
||||||
|
*output = link;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
using GraphFloat32 = Model<TensorRef<BHWC>>;
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_H_
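
As a minimal usage sketch of the graph API above (the helper name
BuildTwoNodeChain is hypothetical; the individual calls mirror those exercised
in model_test.cc later in this change), building the chain
graph_input -> n1 -> link -> n2 -> graph_output could look like this:

// Sketch only: assumes model.h and status.h as declared in this change.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {

Status BuildTwoNodeChain(GraphFloat32* graph) {
  // Nodes must be created in topological order (see NewNode()).
  Node* n1 = graph->NewNode();
  Node* n2 = graph->NewNode();

  // Values without a producer become graph inputs; values without a consumer
  // become graph outputs.
  Value<TensorRefFloat32>* graph_input = graph->NewValue();
  Value<TensorRefFloat32>* link = graph->NewValue();
  Value<TensorRefFloat32>* graph_output = graph->NewValue();

  RETURN_IF_ERROR(graph->AddConsumer(n1->id, graph_input->id));
  RETURN_IF_ERROR(graph->SetProducer(n1->id, link->id));
  RETURN_IF_ERROR(graph->AddConsumer(n2->id, link->id));
  RETURN_IF_ERROR(graph->SetProducer(n2->id, graph_output->id));
  return OkStatus();
}

}  // namespace gpu
}  // namespace tflite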
1935
tensorflow/lite/delegates/gpu/common/model_builder.cc
Normal file
File diff suppressed because it is too large
45
tensorflow/lite/delegates/gpu/common/model_builder.h
Normal file
@ -0,0 +1,45 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_H_
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/context.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
// Validates which operations are supported and returns an array of operations
// to replace with GPU kernels. The caller must free the returned
// TfLiteIntArray.
|
||||||
|
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context);
|
||||||
|
|
||||||
|
// Extracts TFLite delegate execution plan from the input TFLite context and
|
||||||
|
// converts it into generic graph format.
|
||||||
|
Status BuildModel(TfLiteContext* context,
|
||||||
|
const TfLiteDelegateParams* delegate_params,
|
||||||
|
GraphFloat32* graph);
|
||||||
|
|
||||||
|
Status ConvertTfliteTensorToTensorRef(const TfLiteTensor& tflite_tensor,
|
||||||
|
TensorRefFloat32* flow_tensor);
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_H_
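
To make the intended call sequence concrete, here is a minimal sketch of how a
delegate's preparation step might combine the two entry points above
(PrepareGpuGraph is a hypothetical helper name; the surrounding delegate wiring
is assumed to exist):

// Sketch only: context and delegate_params are assumed to come from the TFLite
// runtime.
Status PrepareGpuGraph(TfLiteContext* context,
                       const TfLiteDelegateParams* delegate_params,
                       GraphFloat32* graph) {
  // Ask which ops can be handed off to the GPU. The caller owns the returned
  // array (it would normally be passed back to the runtime to create the
  // delegate kernel) and must free it.
  TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
  TfLiteIntArrayFree(ops_to_replace);

  // Convert the delegated execution plan into the generic graph format.
  return BuildModel(context, delegate_params, graph);
}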
275
tensorflow/lite/delegates/gpu/common/model_test.cc
Normal file
@ -0,0 +1,275 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
|
||||||
|
#include <initializer_list>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <gmock/gmock.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using ::testing::UnorderedElementsAre;
|
||||||
|
|
||||||
|
TEST(Model, SingleNode) {
|
||||||
|
// graph_input -> node -> graph_output
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, graph_input->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, graph_output->id).ok());
|
||||||
|
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre(node));
|
||||||
|
EXPECT_THAT(graph.values(), UnorderedElementsAre(graph_input, graph_output));
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input));
|
||||||
|
EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node->id), UnorderedElementsAre(graph_input));
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node->id), UnorderedElementsAre(graph_output));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_input->id), UnorderedElementsAre(node));
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output->id), ::testing::Eq(node));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_output->id), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_input->id), ::testing::Eq(nullptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, SingleNodeMultipleOutputs) {
|
||||||
|
// graph_input -> node -> (graph_output1, graph_output2)
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output1 = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output2 = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, graph_input->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, graph_output1->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, graph_output2->id).ok());
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node->id),
|
||||||
|
UnorderedElementsAre(graph_output1, graph_output2));
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output1->id), ::testing::Eq(node));
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output2->id), ::testing::Eq(node));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, SetSameConsumer) {
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, graph_input->id).ok());
|
||||||
|
EXPECT_FALSE(graph.AddConsumer(node->id, graph_input->id).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, RemoveConsumer) {
|
||||||
|
// (graph_input1, graph_input2) -> node
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input1 = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_input2 = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, graph_input1->id).ok());
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, graph_input2->id).ok());
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_input1->id),
|
||||||
|
UnorderedElementsAre(node));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_input2->id),
|
||||||
|
UnorderedElementsAre(node));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node->id),
|
||||||
|
UnorderedElementsAre(graph_input1, graph_input2));
|
||||||
|
EXPECT_THAT(graph.outputs(), UnorderedElementsAre());
|
||||||
|
|
||||||
|
// Now remove graph_input1
|
||||||
|
ASSERT_TRUE(graph.RemoveConsumer(node->id, graph_input1->id).ok());
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_input1->id), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindInputs(node->id), UnorderedElementsAre(graph_input2));
|
||||||
|
EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_input1));
|
||||||
|
|
||||||
|
// Can not remove it twice
|
||||||
|
ASSERT_FALSE(graph.RemoveConsumer(node->id, graph_input1->id).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, SetSameProducer) {
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, graph_output->id).ok());
|
||||||
|
EXPECT_FALSE(graph.SetProducer(node->id, graph_output->id).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, RemoveProducer) {
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, graph_output->id).ok());
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output->id), ::testing::Eq(node));
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.RemoveProducer(graph_output->id).ok());
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_output));
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output->id), ::testing::Eq(nullptr));
|
||||||
|
|
||||||
|
// Can not remove producer twice
|
||||||
|
ASSERT_FALSE(graph.RemoveProducer(graph_output->id).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, CircularDependency) {
|
||||||
|
{
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* value = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node->id, value->id).ok());
|
||||||
|
EXPECT_FALSE(graph.SetProducer(node->id, value->id).ok());
|
||||||
|
}
|
||||||
|
{
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* value = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node->id, value->id).ok());
|
||||||
|
EXPECT_FALSE(graph.AddConsumer(node->id, value->id).ok());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, ReassignValue) {
|
||||||
|
// Before:
|
||||||
|
// graph_input -> node1 -> graph_output
|
||||||
|
// \ -> node2
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node1 = graph.NewNode();
|
||||||
|
Node* node2 = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node1->id, graph_input->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node1->id, graph_output->id).ok());
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node2->id, graph_input->id).ok());
|
||||||
|
|
||||||
|
// After:
|
||||||
|
// graph_input -> node1
|
||||||
|
// \ -> node2 -> graph_output
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node2->id, graph_output->id).ok());
|
||||||
|
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre(node1, node2));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node1->id), UnorderedElementsAre(graph_input));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node2->id), UnorderedElementsAre(graph_input));
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node1->id), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node2->id), UnorderedElementsAre(graph_output));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_input->id),
|
||||||
|
UnorderedElementsAre(node1, node2));
|
||||||
|
EXPECT_THAT(graph.FindProducer(graph_output->id), ::testing::Eq(node2));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(graph_output->id), UnorderedElementsAre());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, DeleteValue) {
|
||||||
|
// graph_input -> node1 -> value -> node2 -> graph_output
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node1 = graph.NewNode();
|
||||||
|
Node* node2 = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* value = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node1->id, graph_input->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node1->id, value->id).ok());
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node2->id, value->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node2->id, graph_output->id).ok());
|
||||||
|
|
||||||
|
EXPECT_THAT(graph.values(),
|
||||||
|
UnorderedElementsAre(graph_input, graph_output, value));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(value->id), UnorderedElementsAre(node2));
|
||||||
|
EXPECT_THAT(graph.FindProducer(value->id), ::testing::Eq(node1));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node2->id), UnorderedElementsAre(value));
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node1->id), UnorderedElementsAre(value));
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.DeleteValue(value->id).ok());
|
||||||
|
value = nullptr;
|
||||||
|
EXPECT_THAT(graph.values(), UnorderedElementsAre(graph_input, graph_output));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node2->id), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node1->id), UnorderedElementsAre());
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.DeleteValue(graph_input->id).ok());
|
||||||
|
graph_input = nullptr;
|
||||||
|
EXPECT_THAT(graph.values(), UnorderedElementsAre(graph_output));
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindInputs(node1->id), UnorderedElementsAre());
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.DeleteValue(graph_output->id).ok());
|
||||||
|
graph_output = nullptr;
|
||||||
|
EXPECT_THAT(graph.values(), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.outputs(), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindOutputs(node2->id), UnorderedElementsAre());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Model, DeleteNode) {
|
||||||
|
// graph_input -> node1 -> value -> node2 -> graph_output
|
||||||
|
// \-> node3 -> graph_output2
|
||||||
|
GraphFloat32 graph;
|
||||||
|
Node* node1 = graph.NewNode();
|
||||||
|
Node* node2 = graph.NewNode();
|
||||||
|
Node* node3 = graph.NewNode();
|
||||||
|
Value<TensorRefFloat32>* graph_input = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* graph_output2 = graph.NewValue();
|
||||||
|
Value<TensorRefFloat32>* value = graph.NewValue();
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node1->id, graph_input->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node1->id, value->id).ok());
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node2->id, value->id).ok());
|
||||||
|
ASSERT_TRUE(graph.AddConsumer(node3->id, value->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node2->id, graph_output->id).ok());
|
||||||
|
ASSERT_TRUE(graph.SetProducer(node3->id, graph_output2->id).ok());
|
||||||
|
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre(node1, node2, node3));
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input));
|
||||||
|
EXPECT_THAT(graph.outputs(),
|
||||||
|
UnorderedElementsAre(graph_output, graph_output2));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(value->id),
|
||||||
|
UnorderedElementsAre(node2, node3));
|
||||||
|
EXPECT_THAT(graph.FindProducer(value->id), ::testing::Eq(node1));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node2->id), UnorderedElementsAre(value));
|
||||||
|
EXPECT_THAT(graph.FindInputs(node3->id), UnorderedElementsAre(value));
|
||||||
|
|
||||||
|
// graph_input -> node1 -> value -> node2 -> graph_output
|
||||||
|
// graph_output2
|
||||||
|
ASSERT_TRUE(graph.DeleteNode(node3->id).ok());
|
||||||
|
node3 = nullptr;
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre(node1, node2));
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input, graph_output2));
|
||||||
|
EXPECT_THAT(graph.outputs(),
|
||||||
|
UnorderedElementsAre(graph_output, graph_output2));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(value->id), UnorderedElementsAre(node2));
|
||||||
|
|
||||||
|
// value -> node2 -> graph_output
|
||||||
|
// graph_input
|
||||||
|
// graph_output2
|
||||||
|
ASSERT_TRUE(graph.DeleteNode(node1->id).ok());
|
||||||
|
node1 = nullptr;
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre(node2));
|
||||||
|
EXPECT_THAT(graph.inputs(),
|
||||||
|
UnorderedElementsAre(value, graph_output2, graph_input));
|
||||||
|
EXPECT_THAT(graph.outputs(),
|
||||||
|
UnorderedElementsAre(graph_input, graph_output, graph_output2));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(value->id), UnorderedElementsAre(node2));
|
||||||
|
EXPECT_THAT(graph.FindProducer(value->id), ::testing::Eq(nullptr));
|
||||||
|
|
||||||
|
ASSERT_TRUE(graph.DeleteNode(node2->id).ok());
|
||||||
|
node2 = nullptr;
|
||||||
|
EXPECT_THAT(graph.nodes(), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_output, graph_output2,
|
||||||
|
graph_input, value));
|
||||||
|
EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output, graph_output2,
|
||||||
|
graph_input, value));
|
||||||
|
EXPECT_THAT(graph.FindConsumers(value->id), UnorderedElementsAre());
|
||||||
|
EXPECT_THAT(graph.FindProducer(value->id), ::testing::Eq(nullptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
197
tensorflow/lite/delegates/gpu/common/model_transformer.cc
Normal file
@ -0,0 +1,197 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
|
||||||
|
|
||||||
|
#include <deque>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "absl/strings/str_join.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
bool ModelTransformer::Apply(const std::string& name,
|
||||||
|
SequenceTransformation* transformation) {
|
||||||
|
// Seed transformations with starting node. Each node may start a chain of
|
||||||
|
// transformations.
|
||||||
|
for (auto input : graph_->inputs()) {
|
||||||
|
for (auto node : graph_->FindConsumers(input->id)) {
|
||||||
|
AddNodeToProcess(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (!to_process_.empty()) {
|
||||||
|
auto node = graph_->GetNode(to_process_.front());
|
||||||
|
if (node) {
|
||||||
|
if (!ApplyStartingWithNode(name, transformation, node)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
to_process_.pop_front();
|
||||||
|
}
|
||||||
|
processed_.clear();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ModelTransformer::Apply(const std::string& name,
|
||||||
|
NodeTransformation* transformation) {
|
||||||
|
// Apply a transformation only to nodes that are present in the graph before
|
||||||
|
// transformation.
|
||||||
|
std::vector<NodeId> nodes;
|
||||||
|
for (auto node : graph_->nodes()) {
|
||||||
|
nodes.push_back(node->id);
|
||||||
|
}
|
||||||
|
for (auto node_id : nodes) {
|
||||||
|
auto node = graph_->GetNode(node_id);
|
||||||
|
if (!node) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto result = transformation->ApplyToNode(node, graph_);
|
||||||
|
if (result.status == TransformStatus::INVALID) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (reporter_) {
|
||||||
|
if (result.status == TransformStatus::APPLIED) {
|
||||||
|
reporter_->AppliedTransformation(name, std::to_string(node_id),
|
||||||
|
result.message);
|
||||||
|
}
|
||||||
|
if (result.status == TransformStatus::DECLINED) {
|
||||||
|
reporter_->DeclinedTransformation(name, std::to_string(node_id),
|
||||||
|
result.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ModelTransformer::ApplyStartingWithNode(
|
||||||
|
const std::string& name, SequenceTransformation* transformation,
|
||||||
|
Node* begin) {
|
||||||
|
int expected_sequence_length = transformation->ExpectedSequenceLength();
|
||||||
|
|
||||||
|
std::deque<NodeId> sequence;
|
||||||
|
std::vector<Node*> nodes;
|
||||||
|
nodes.reserve(transformation->ExpectedSequenceLength());
|
||||||
|
sequence.push_back(begin->id);
|
||||||
|
|
||||||
|
// Go over the nodes with a sliding window of size expected_sequence_length
// until a node with multiple dependents is found.
|
||||||
|
while (true) {
|
||||||
|
// Apply transformation if possible.
|
||||||
|
if (sequence.size() == expected_sequence_length) {
|
||||||
|
nodes.clear();
|
||||||
|
for (NodeId id : sequence) {
|
||||||
|
// Nodes present in the sequence must be present in the graph. If they are
// not, then a transformation changed the graph without reporting it.
|
||||||
|
Node* node = graph_->GetNode(id);
|
||||||
|
if (node == nullptr) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
nodes.push_back(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
NodeId first_in_sequence = sequence.front();
|
||||||
|
auto preceding_node =
|
||||||
|
graph_->FindProducer(graph_->FindInputs(first_in_sequence)[0]->id);
|
||||||
|
auto result = transformation->ApplyToNodesSequence(nodes, graph_);
|
||||||
|
if (result.status == TransformStatus::INVALID) {
|
||||||
|
// graph is broken now.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (result.status == TransformStatus::DECLINED) {
|
||||||
|
if (reporter_) {
|
||||||
|
reporter_->DeclinedTransformation(name, absl::StrJoin(sequence, "+"),
|
||||||
|
result.message);
|
||||||
|
}
|
||||||
|
} else if (result.status == TransformStatus::APPLIED) {
|
||||||
|
if (reporter_) {
|
||||||
|
reporter_->AppliedTransformation(name, absl::StrJoin(sequence, "+"),
|
||||||
|
result.message);
|
||||||
|
}
|
||||||
|
// Also remove the first node of the sequence from the set of processed
// nodes. Out of all nodes in a sequence, only the first one may have been
// added to the "processed" set, because the other nodes do not have more
// than one dependent. However, if the sequence changed, processing needs
// to be restarted.
|
||||||
|
processed_.erase(first_in_sequence);
|
||||||
|
// Transformation was successful. Restart sequence from the node that
|
||||||
|
// precedes current sequence.
|
||||||
|
if (preceding_node) {
|
||||||
|
processed_.erase(preceding_node->id);
|
||||||
|
AddNodeToProcess(preceding_node);
|
||||||
|
} else {
|
||||||
|
// This is the first node in the graph. Re-seed transformation.
|
||||||
|
for (auto input : graph_->inputs()) {
|
||||||
|
for (auto node : graph_->FindConsumers(input->id)) {
|
||||||
|
AddNodeToProcess(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to extend current sequence.
|
||||||
|
Node* next_node_in_sequence = nullptr;
|
||||||
|
bool has_multiple_children = false;
|
||||||
|
|
||||||
|
// Check that all outputs from last node are consumed by a single node.
|
||||||
|
for (auto output_value : graph_->FindOutputs(sequence.back())) {
|
||||||
|
for (auto dependent : graph_->FindConsumers(output_value->id)) {
|
||||||
|
if (has_multiple_children) {
|
||||||
|
AddNodeToProcess(dependent);
|
||||||
|
} else if (next_node_in_sequence == nullptr) {
|
||||||
|
next_node_in_sequence = dependent;
|
||||||
|
} else if (next_node_in_sequence != dependent) {
|
||||||
|
// More than one node depends on the output of the last node, therefore
// the sequence stops here and a new one will start. Push all such nodes.
|
||||||
|
has_multiple_children = true;
|
||||||
|
AddNodeToProcess(dependent);
|
||||||
|
AddNodeToProcess(next_node_in_sequence);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now check that next node has inputs only produced by the last node.
|
||||||
|
if (!has_multiple_children && next_node_in_sequence) {
|
||||||
|
for (auto input : graph_->FindInputs(next_node_in_sequence->id)) {
|
||||||
|
auto producer = graph_->FindProducer(input->id);
|
||||||
|
if (producer == nullptr || producer->id != sequence.back()) {
|
||||||
|
has_multiple_children = true;
|
||||||
|
AddNodeToProcess(next_node_in_sequence);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_multiple_children || next_node_in_sequence == nullptr) {
|
||||||
|
// reached end of this transformation sequence.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence.push_back(next_node_in_sequence->id);
|
||||||
|
// Trim the sequence so it does not exceed the expected length.
|
||||||
|
if (sequence.size() > expected_sequence_length) {
|
||||||
|
sequence.pop_front();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
146
tensorflow/lite/delegates/gpu/common/model_transformer.h
Normal file
@ -0,0 +1,146 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_TRANSFORMER_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_TRANSFORMER_H_
|
||||||
|
|
||||||
|
#include <deque>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
class TransformationReporter;
|
||||||
|
|
||||||
|
struct TransformationContext {
|
||||||
|
GraphFloat32* graph;
|
||||||
|
TransformationReporter* reporter;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class TransformStatus {
|
||||||
|
// Transformation was not applied due to a trivial condition mismatch.
//
// This is different from the DECLINED status below, which provides an
// in-depth explanation of why a transformation that could have been applied
// was not applied due to some issue.
|
||||||
|
SKIPPED,
|
||||||
|
|
||||||
|
// Transformation was declined, therefore, a model was not modified.
|
||||||
|
DECLINED,
|
||||||
|
|
||||||
|
// Transformation was applied successfully
|
||||||
|
APPLIED,
|
||||||
|
|
||||||
|
// Transformation may have been partially applied, leaving the model in an
// invalid state. This error should be considered unrecoverable.
|
||||||
|
INVALID,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TransformResult {
|
||||||
|
TransformStatus status;
|
||||||
|
std::string message;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Class responsible for applying a transformation to a single node.
|
||||||
|
class NodeTransformation {
|
||||||
|
public:
|
||||||
|
virtual ~NodeTransformation() = default;
|
||||||
|
|
||||||
|
virtual TransformResult ApplyToNode(Node* node, GraphFloat32* graph) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Class responsible for applying a transformation to a sequence of nodes.
// The nodes are guaranteed to form a chain in which each node depends on the
// previous one and no intermediate node has extra dependents.
|
||||||
|
class SequenceTransformation {
|
||||||
|
public:
|
||||||
|
virtual ~SequenceTransformation() = default;
|
||||||
|
|
||||||
|
// @return number of nodes in a sequence to apply this transformation.
|
||||||
|
virtual int ExpectedSequenceLength() const = 0;
|
||||||
|
|
||||||
|
// Applies a transformation to a sequence of nodes. The transformation
// implementation is free to manipulate the sequence nodes, including adding
// and/or deleting nodes. If nodes at the end and/or beginning of the
// sequence were updated, referential consistency should be maintained by
// updating the relevant references in nodes that precede this sequence or
// depend on the last node of the sequence.
|
||||||
|
virtual TransformResult ApplyToNodesSequence(
|
||||||
|
const std::vector<Node*>& sequence, GraphFloat32* graph) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// A class that accumulates decisions or updates made by transformations.
|
||||||
|
class TransformationReporter {
|
||||||
|
public:
|
||||||
|
virtual ~TransformationReporter() = default;
|
||||||
|
|
||||||
|
virtual void DeclinedTransformation(const std::string& transformation,
|
||||||
|
const std::string& node_ids,
|
||||||
|
const std::string& message) = 0;
|
||||||
|
|
||||||
|
virtual void AppliedTransformation(const std::string& transformation,
|
||||||
|
const std::string& node_ids,
|
||||||
|
const std::string& message) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// A class designed to perform model transformations.
|
||||||
|
class ModelTransformer {
|
||||||
|
public:
|
||||||
|
ModelTransformer(GraphFloat32* graph, TransformationReporter* reporter)
|
||||||
|
: graph_(graph), reporter_(reporter) {}
|
||||||
|
|
||||||
|
// @return false if the graph is in a broken state and can not be used any
// more.
|
||||||
|
bool Apply(const std::string& name, SequenceTransformation* transformation);
|
||||||
|
|
||||||
|
// @return false if the graph is in a broken state and can not be used any
// more.
|
||||||
|
bool Apply(const std::string& name, NodeTransformation* transformation);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool ApplyStartingWithNode(const std::string& name,
|
||||||
|
SequenceTransformation* transformation,
|
||||||
|
Node* begin);
|
||||||
|
|
||||||
|
void AddNodeToProcess(Node* node) {
|
||||||
|
if (node && processed_.insert(node->id).second) {
|
||||||
|
to_process_.push_back(node->id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GraphFloat32* graph_;
|
||||||
|
TransformationReporter* reporter_;
|
||||||
|
|
||||||
|
std::deque<NodeId> to_process_;
|
||||||
|
std::unordered_set<NodeId> processed_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class NullTransformationReporter : public TransformationReporter {
|
||||||
|
public:
|
||||||
|
void DeclinedTransformation(const std::string& transformation,
|
||||||
|
const std::string& nodes_id,
|
||||||
|
const std::string& message) override {}
|
||||||
|
|
||||||
|
void AppliedTransformation(const std::string& transformation,
|
||||||
|
const std::string& nodes_id,
|
||||||
|
const std::string& message) override {}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_TRANSFORMER_H_
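
To show how these pieces compose, here is a minimal sketch (the transformation
itself is a made-up no-op; only the interfaces come from this header) of
defining a NodeTransformation and running it through ModelTransformer:

// Sketch only: a trivial transformation that inspects every node and declines
// to change anything.
class NoopTransformation : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final {
    // A real transformation would look at node->operation and rewrite the
    // graph here.
    return {TransformStatus::SKIPPED, "nothing to do"};
  }
};

// Usage (graph is a GraphFloat32 built elsewhere):
//   NoopTransformation noop;
//   NullTransformationReporter reporter;
//   ModelTransformer transformer(&graph, &reporter);
//   bool graph_still_valid = transformer.Apply("noop", &noop);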
396
tensorflow/lite/delegates/gpu/common/operations.cc
Normal file
@ -0,0 +1,396 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/operations.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
Padding2D& Padding2D::operator=(const Padding2D& value) {
|
||||||
|
prepended = value.prepended;
|
||||||
|
appended = value.appended;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Padding2D::operator==(const Padding2D& value) {
|
||||||
|
return this->prepended == value.prepended && this->appended == value.appended;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Padding2D::operator!=(const Padding2D& value) { return !(*this == value); }
|
||||||
|
|
||||||
|
std::string ToString(enum OperationType op) {
|
||||||
|
switch (op) {
|
||||||
|
case OperationType::UNKNOWN:
|
||||||
|
break;
|
||||||
|
case OperationType::ABS:
|
||||||
|
return "abs";
|
||||||
|
case OperationType::ADD:
|
||||||
|
return "add";
|
||||||
|
case OperationType::APPLY_MASK:
|
||||||
|
return "apply_mask";
|
||||||
|
case OperationType::SUB:
|
||||||
|
return "subtract";
|
||||||
|
case OperationType::POOLING_2D:
|
||||||
|
return "pooling_2d";
|
||||||
|
case OperationType::MAX_UNPOOLING_2D:
|
||||||
|
return "max_unpooling";
|
||||||
|
case OperationType::BATCH_NORMALIZATION:
|
||||||
|
return "batch_normalization";
|
||||||
|
case OperationType::CONCAT:
|
||||||
|
return "concat";
|
||||||
|
case OperationType::CONST:
|
||||||
|
return "const";
|
||||||
|
case OperationType::CONVOLUTION_2D:
|
||||||
|
return "convolution_2d";
|
||||||
|
case OperationType::COS:
|
||||||
|
return "cos";
|
||||||
|
case OperationType::DEPTHWISE_CONVOLUTION:
|
||||||
|
return "depthwise_convolution";
|
||||||
|
case OperationType::LOG:
|
||||||
|
return "log";
|
||||||
|
case OperationType::MUL:
|
||||||
|
return "mul";
|
||||||
|
case OperationType::PAD:
|
||||||
|
return "pad";
|
||||||
|
case OperationType::PRELU:
|
||||||
|
return "prelu";
|
||||||
|
case OperationType::RELU:
|
||||||
|
return "relu";
|
||||||
|
case OperationType::RESIZE:
|
||||||
|
return "resize";
|
||||||
|
case OperationType::RESHAPE:
|
||||||
|
return "reshape";
|
||||||
|
case OperationType::RSQRT:
|
||||||
|
return "rsqrt";
|
||||||
|
case OperationType::SIGMOID:
|
||||||
|
return "sigmoid";
|
||||||
|
case OperationType::SIN:
|
||||||
|
return "sin";
|
||||||
|
case OperationType::SLICE:
|
||||||
|
return "slice";
|
||||||
|
case OperationType::SOFT_MAX:
|
||||||
|
return "soft_max";
|
||||||
|
case OperationType::SQRT:
|
||||||
|
return "sqrt";
|
||||||
|
case OperationType::SQUARE:
|
||||||
|
return "square";
|
||||||
|
case OperationType::UPSAMPLE_2D:
|
||||||
|
return "upsample_2d";
|
||||||
|
case OperationType::CONVOLUTION_TRANSPOSED:
|
||||||
|
return "convolution_transposed";
|
||||||
|
case OperationType::MULTIPLY_SCALAR:
|
||||||
|
return "multiply_scalar";
|
||||||
|
case OperationType::FULLY_CONNECTED:
|
||||||
|
return "fully_connected";
|
||||||
|
case OperationType::TANH:
|
||||||
|
return "tanh";
|
||||||
|
case OperationType::LSTM:
|
||||||
|
return "lstm";
|
||||||
|
}
|
||||||
|
return "unknown_operation";
|
||||||
|
}
|
||||||
|
|
||||||
|
OperationType OperationTypeFromString(const std::string& name) {
|
||||||
|
static const auto operations =
|
||||||
|
new std::unordered_map<std::string, OperationType>({
|
||||||
|
{"abs", OperationType::ABS},
|
||||||
|
{"add", OperationType::ADD},
|
||||||
|
{"apply_mask", OperationType::APPLY_MASK},
|
||||||
|
{"batch_normalization", OperationType::BATCH_NORMALIZATION},
|
||||||
|
{"concat", OperationType::CONCAT},
|
||||||
|
{"const", OperationType::CONST},
|
||||||
|
{"convolution_2d", OperationType::CONVOLUTION_2D},
|
||||||
|
{"convolution_transposed", OperationType::CONVOLUTION_TRANSPOSED},
|
||||||
|
{"cos", OperationType::COS},
|
||||||
|
{"depthwise_convolution", OperationType::DEPTHWISE_CONVOLUTION},
|
||||||
|
{"fully_connected", OperationType::FULLY_CONNECTED},
|
||||||
|
{"log", OperationType::LOG},
|
||||||
|
{"lstm", OperationType::LSTM},
|
||||||
|
{"max_unpooling", OperationType::MAX_UNPOOLING_2D},
|
||||||
|
{"mul", OperationType::MUL},
|
||||||
|
{"multiply_scalar", OperationType::MULTIPLY_SCALAR},
|
||||||
|
{"pad", OperationType::PAD},
|
||||||
|
{"pooling_2d", OperationType::POOLING_2D},
|
||||||
|
{"prelu", OperationType::PRELU},
|
||||||
|
{"relu", OperationType::RELU},
|
||||||
|
{"resize", OperationType::RESIZE},
|
||||||
|
{"reshape", OperationType::RESHAPE},
|
||||||
|
{"rsqrt", OperationType::RSQRT},
|
||||||
|
{"sigmoid", OperationType::SIGMOID},
|
||||||
|
{"sin", OperationType::SIN},
|
||||||
|
{"slice", OperationType::SLICE},
|
||||||
|
{"soft_max", OperationType::SOFT_MAX},
|
||||||
|
{"sqrt", OperationType::SQRT},
|
||||||
|
{"square", OperationType::SQUARE},
|
||||||
|
{"subtract", OperationType::SUB},
|
||||||
|
{"tanh", OperationType::TANH},
|
||||||
|
{"upsample_2d", OperationType::UPSAMPLE_2D},
|
||||||
|
});
|
||||||
|
auto op = operations->find(name);
|
||||||
|
return op == operations->end() ? OperationType::UNKNOWN : op->second;
|
||||||
|
}
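
Since ToString and OperationTypeFromString above use matching names, they
round-trip for every named operation; for example (illustrative only, assuming
<cassert> is available):

assert(OperationTypeFromString(ToString(OperationType::ADD)) ==
       OperationType::ADD);  // "add"
assert(OperationTypeFromString("no_such_op") == OperationType::UNKNOWN);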
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
T IntegralDivideRoundUp(T n, T divisor) {
|
||||||
|
return (n - 1) / divisor + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t CalculateOutputSizeBeforeStrides(int32_t input, int32_t kernel,
|
||||||
|
int32_t padding, int32_t dilation) {
|
||||||
|
const int32_t dilated_kernel = (kernel - 1) * dilation + 1;
|
||||||
|
return input + padding - dilated_kernel + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis T>
|
||||||
|
int32_t CalculateOutputWithoutStrides(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr) {
|
||||||
|
return CalculateOutputSizeBeforeStrides(
|
||||||
|
input.get<T>(), attr.weights.shape.get<T>(),
|
||||||
|
attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(),
|
||||||
|
attr.dilations.get<T>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis T>
|
||||||
|
int32_t CalculateOutputWithoutStrides(const BHWC& input,
|
||||||
|
const Pooling2DAttributes& attr) {
|
||||||
|
return CalculateOutputSizeBeforeStrides(
|
||||||
|
input.get<T>(), attr.kernel.get<T>(),
|
||||||
|
attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(),
|
||||||
|
/*dilation=*/1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis T>
|
||||||
|
int32_t CalculateOutput(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr) {
|
||||||
|
return (input.get<T>() - 1) * attr.stride.get<T>() -
|
||||||
|
(attr.padding.prepended.get<T>() + attr.padding.appended.get<T>()) +
|
||||||
|
attr.weights.shape.get<T>() + attr.adjacent.get<T>();
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int32_t StridedSize(int32_t size, int32_t stride) {
|
||||||
|
return stride == 0 ? -1 : IntegralDivideRoundUp(size, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis AxisT, typename AttrT>
|
||||||
|
int32_t CalculateOutput(const BHWC& input, const AttrT& attr) {
|
||||||
|
return StridedSize(CalculateOutputWithoutStrides<AxisT>(input, attr),
|
||||||
|
attr.strides.template get<AxisT>());
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t CalculateSamePadding(int32_t input, int32_t kernel, int32_t dilation,
|
||||||
|
int32_t stride) {
|
||||||
|
const int32_t dilated_kernel = (kernel - 1) * dilation + 1;
|
||||||
|
return std::max(0, dilated_kernel - (input - 1) % stride - 1);
|
||||||
|
}
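
As a quick numeric check of the formulas above (illustrative values only):

// input = 10, kernel = 3, dilation = 1, stride = 2
//   dilated_kernel = (3 - 1) * 1 + 1 = 3
//   same_padding   = max(0, 3 - (10 - 1) % 2 - 1) = 1
//   unstrided_out  = 10 + 1 - 3 + 1 = 9
//   strided_out    = IntegralDivideRoundUp(9, 2) = 5, i.e. ceil(10 / 2)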
|
||||||
|
|
||||||
|
// Returns the padding that must be added so that the image size stays the
// same.
|
||||||
|
template <Axis AxisT>
|
||||||
|
int32_t CalculateSamePadding(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr) {
|
||||||
|
return CalculateSamePadding(
|
||||||
|
input.get<AxisT>(), attr.weights.shape.get<AxisT>(),
|
||||||
|
attr.dilations.get<AxisT>(), attr.strides.get<AxisT>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis AxisT>
|
||||||
|
int32_t CalculateSamePadding(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr) {
|
||||||
|
return CalculateSamePadding(input.get<AxisT>(),
|
||||||
|
attr.weights.shape.get<AxisT>(),
|
||||||
|
/*dilation=*/1, attr.stride.get<AxisT>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis AxisT>
|
||||||
|
int32_t CalculateSamePadding(const BHWC& input,
|
||||||
|
const Pooling2DAttributes& attr) {
|
||||||
|
return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
|
||||||
|
/*dilation=*/1, attr.strides.get<AxisT>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis AxisT>
|
||||||
|
int32_t CalculateSamePadding(const BHWC& input,
|
||||||
|
const MaxUnpooling2DAttributes& attr) {
|
||||||
|
return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
|
||||||
|
/*dilation=*/1, attr.strides.get<AxisT>());
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D MakeSamePadding(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr) {
|
||||||
|
int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
|
||||||
|
int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
|
||||||
|
Padding2D padding;
|
||||||
|
padding.prepended = HW(padding_height / 2, padding_width / 2);
|
||||||
|
padding.appended = HW(padding_height - padding_height / 2,
|
||||||
|
padding_width - padding_width / 2);
|
||||||
|
return padding;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If padding depends on input, convert it into fixed padding.
|
||||||
|
template <class AttrT>
|
||||||
|
Padding2D MakeSamePadding(const BHWC& input, const AttrT& attr) {
|
||||||
|
int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
|
||||||
|
int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
|
||||||
|
Padding2D padding;
|
||||||
|
padding.prepended = HW(padding_height / 2, padding_width / 2);
|
||||||
|
padding.appended = HW(padding_height - padding_height / 2,
|
||||||
|
padding_width - padding_width / 2);
|
||||||
|
return padding;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const MaxUnpooling2DAttributes& attr) {
|
||||||
|
return BHWC(input.b,
|
||||||
|
input.h * attr.strides.h - attr.padding.prepended.h -
|
||||||
|
attr.padding.appended.h,
|
||||||
|
input.w * attr.strides.w - attr.padding.prepended.w -
|
||||||
|
attr.padding.appended.w,
|
||||||
|
input.c);
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const Pooling2DAttributes& attr) {
|
||||||
|
return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
|
||||||
|
CalculateOutput<Axis::WIDTH>(input, attr), input.c);
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr) {
|
||||||
|
return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
|
||||||
|
CalculateOutput<Axis::WIDTH>(input, attr),
|
||||||
|
attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr) {
|
||||||
|
return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
|
||||||
|
CalculateOutput<Axis::WIDTH>(input, attr),
|
||||||
|
attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const DepthwiseConvolution2DAttributes& attr) {
|
||||||
|
return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
|
||||||
|
CalculateOutput<Axis::WIDTH>(input, attr),
|
||||||
|
attr.weights.shape.get<Axis::OUTPUT_CHANNELS>() *
|
||||||
|
attr.weights.shape.get<Axis::INPUT_CHANNELS>());
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const SliceAttributes& attr) {
|
||||||
|
return BHWC(input.b, StridedSize(attr.ends.h - attr.starts.h, attr.strides.h),
|
||||||
|
StridedSize(attr.ends.w - attr.starts.w, attr.strides.w),
|
||||||
|
StridedSize(attr.ends.c - attr.starts.c, attr.strides.c));
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const PadAttributes& attr) {
|
||||||
|
return BHWC(input.b, attr.appended.h + attr.prepended.h + input.h,
|
||||||
|
attr.appended.w + attr.prepended.w + input.w,
|
||||||
|
attr.appended.c + attr.prepended.c + input.c);
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const FullyConnectedAttributes& attr) {
|
||||||
|
return BHWC(input.b, 1, 1, attr.weights.shape.o);
|
||||||
|
}
|
||||||
|
|
||||||
|
Status CalculateOutputShape(const std::vector<BHWC>& input,
|
||||||
|
const ConcatAttributes& attr, BHWC* output_shape) {
|
||||||
|
BHWC new_shape = input[0];
|
||||||
|
switch (attr.axis) {
|
||||||
|
case Axis::CHANNELS:
|
||||||
|
for (int i = 1; i < input.size(); i++) {
|
||||||
|
if (input[i].h != new_shape.h || input[i].w != new_shape.w) {
|
||||||
|
return InvalidArgumentError(
|
||||||
|
"Height and Width must be the same when concatenating "
|
||||||
|
"by channels axis");
|
||||||
|
}
|
||||||
|
new_shape.c += input[i].c;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case Axis::HEIGHT:
|
||||||
|
for (int i = 1; i < input.size(); i++) {
|
||||||
|
if (input[i].w != new_shape.w || input[i].c != new_shape.c) {
|
||||||
|
return InvalidArgumentError(
|
||||||
|
"Channels and Width must be the same when concatenating "
|
||||||
|
"by height axis");
|
||||||
|
}
|
||||||
|
new_shape.h += input[i].h;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case Axis::WIDTH:
|
||||||
|
for (int i = 1; i < input.size(); i++) {
|
||||||
|
if (input[i].h != new_shape.h || input[i].c != new_shape.c) {
|
||||||
|
return InvalidArgumentError(
|
||||||
|
"Height and Channels must be the same when concatenating "
|
||||||
|
"by width axis");
|
||||||
|
}
|
||||||
|
new_shape.w += input[i].w;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return InvalidArgumentError("Invalid axis");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
*output_shape = new_shape;
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr) {
|
||||||
|
return MakeSamePadding(input, attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr) {
|
||||||
|
return MakeSamePadding(input, attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const DepthwiseConvolution2DAttributes& attr) {
|
||||||
|
return MakeSamePadding(input, attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const Pooling2DAttributes& attr) {
|
||||||
|
return MakeSamePadding(input, attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const MaxUnpooling2DAttributes& attr) {
|
||||||
|
return MakeSamePadding(input, attr);
|
||||||
|
}
|
||||||
|
|
||||||
|
float CalculateResizeScale(int32_t input_size, int32_t output_size,
|
||||||
|
const Upsample2DAttributes& attr) {
|
||||||
|
return attr.align_corners && input_size > 1 && output_size > 1
|
||||||
|
? static_cast<float>(input_size - 1) / (output_size - 1)
|
||||||
|
: static_cast<float>(input_size) / output_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const Upsample2DAttributes& attr) {
|
||||||
|
return BHWC(input.b, attr.new_shape.h, attr.new_shape.w, input.c);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
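As an illustration of how the shape helpers above compose, here is a minimal usage sketch (not part of the commit; the sizes are made up for the example):

// Sketch only: assumes the declarations from operations.h and shape.h.
Convolution2DAttributes attr;
attr.weights.shape = OHWI(16, 3, 3, 3);  // 16 output channels, 3x3 kernel, 3 input channels
attr.strides = HW(2, 2);
attr.dilations = HW(1, 1);
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);

BHWC input(1, 224, 224, 3);
BHWC output = CalculateOutputShape(input, attr);
// output == BHWC(1, 112, 112, 16): (224 + 2 - 3 + 1) rounded up over stride 2 gives 112.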
319
tensorflow/lite/delegates/gpu/common/operations.h
Normal file
319
tensorflow/lite/delegates/gpu/common/operations.h
Normal file
@ -0,0 +1,319 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_OPERATIONS_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_OPERATIONS_H_
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "absl/types/variant.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
// Non-exhaustive list of operations.
|
||||||
|
enum class OperationType {
|
||||||
|
UNKNOWN = 0,
|
||||||
|
ABS,
|
||||||
|
ADD,
|
||||||
|
// TODO(eignasheva): remove APPLY_MASK operation, it should be just MUL.
|
||||||
|
APPLY_MASK,
|
||||||
|
BATCH_NORMALIZATION,
|
||||||
|
CONCAT,
|
||||||
|
CONST,
|
||||||
|
CONVOLUTION_2D,
|
||||||
|
CONVOLUTION_TRANSPOSED,
|
||||||
|
COS,
|
||||||
|
DEPTHWISE_CONVOLUTION,
|
||||||
|
FULLY_CONNECTED,
|
||||||
|
LOG,
|
||||||
|
LSTM,
|
||||||
|
MAX_UNPOOLING_2D,
|
||||||
|
MUL,
|
||||||
|
MULTIPLY_SCALAR,
|
||||||
|
POOLING_2D,
|
||||||
|
PAD,
|
||||||
|
PRELU,
|
||||||
|
RELU,
|
||||||
|
RESHAPE,
|
||||||
|
RESIZE,
|
||||||
|
RSQRT,
|
||||||
|
SIGMOID,
|
||||||
|
SIN,
|
||||||
|
SLICE,
|
||||||
|
SOFT_MAX,
|
||||||
|
SQRT,
|
||||||
|
SQUARE,
|
||||||
|
SUB,
|
||||||
|
TANH,
|
||||||
|
UPSAMPLE_2D,
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string ToString(enum OperationType op);
|
||||||
|
|
||||||
|
OperationType OperationTypeFromString(const std::string& name);
|
||||||
|
|
||||||
|
struct Padding2D {
|
||||||
|
Padding2D() = default;
|
||||||
|
Padding2D& operator=(const Padding2D& value);
|
||||||
|
bool operator==(const Padding2D& value);
|
||||||
|
bool operator!=(const Padding2D& value);
|
||||||
|
|
||||||
|
// Padding values for every axis (if needed), where 'prepended' defines
|
||||||
|
// padding for the beginning of each axis and 'appended' represents end part
|
||||||
|
// of the corresponding axis.
|
||||||
|
HW prepended = HW(-1, -1);
|
||||||
|
HW appended = HW(-1, -1);
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class PoolingType {
|
||||||
|
UNDEFINED = 0,
|
||||||
|
|
||||||
|
// average pooling
|
||||||
|
AVERAGE = 1,
|
||||||
|
|
||||||
|
// max pooling
|
||||||
|
MAX = 2,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Pooling2DAttributes {
|
||||||
|
PoolingType type = PoolingType::UNDEFINED;
|
||||||
|
// Strides for every axis.
|
||||||
|
HW strides = HW(-1, -1);
|
||||||
|
HW kernel = HW(-1, -1);
|
||||||
|
Padding2D padding;
|
||||||
|
// NOTE(akulik): technically the number of outputs from Pooling node indicates
|
||||||
|
// whether indices are needed or not, but I decided to keep it inside
|
||||||
|
// attributes to simplify processing.
|
||||||
|
bool output_indices = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MaxUnpooling2DAttributes {
|
||||||
|
// Strides for every axis.
|
||||||
|
HW strides = HW(-1, -1);
|
||||||
|
HW kernel = HW(-1, -1);
|
||||||
|
Padding2D padding;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ConcatAttributes {
|
||||||
|
// Defines the axis along which to concatenate.
|
||||||
|
Axis axis = Axis::UNKNOWN;
|
||||||
|
};
|
||||||
|
|
||||||
|
// @return shape of a tensor after MaxUnpooling2D operation is applied to
|
||||||
|
// the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const MaxUnpooling2DAttributes& attr);
|
||||||
|
|
||||||
|
// @return shape of a tensor after Pooling2D operation is applied to the given
|
||||||
|
// input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const Pooling2DAttributes& attr);
|
||||||
|
|
||||||
|
// @return shape of a tensor after Concat operation is applied to the given
|
||||||
|
// input.
|
||||||
|
Status CalculateOutputShape(const std::vector<BHWC>& input,
|
||||||
|
const ConcatAttributes& attr, BHWC* output_shape);
|
||||||
|
|
||||||
|
// @return padding for pooling operation to make sure the output keeps the same
// shape as the given input.
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const Pooling2DAttributes& attr);
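For example, a minimal sketch (values are illustrative only) of turning SAME padding into an explicit Padding2D for a pooling node:

Pooling2DAttributes pool;
pool.type = PoolingType::MAX;
pool.kernel = HW(3, 3);
pool.strides = HW(1, 1);

BHWC input(1, 224, 224, 8);
Padding2D same = CalculateSamePadding(input, pool);
// same.prepended == HW(1, 1) and same.appended == HW(1, 1),
// so the pooled output stays 224x224.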
|
||||||
|
|
||||||
|
// @return padding for max unpooling operation to make sure the output keeps the
// same shape as the given input.
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const MaxUnpooling2DAttributes& attr);
|
||||||
|
|
||||||
|
struct Convolution2DAttributes {
|
||||||
|
HW strides = HW(1, 1); // Along each axis.
|
||||||
|
HW dilations = HW(1, 1); // Along each axis.
|
||||||
|
Padding2D padding;
|
||||||
|
|
||||||
|
Tensor<OHWI, DataType::FLOAT32> weights;
|
||||||
|
Tensor<Linear, DataType::FLOAT32> bias; // optional
|
||||||
|
};
|
||||||
|
|
||||||
|
// @return shape of a tensor after Convolution2D operation is applied to
|
||||||
|
// the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr);
|
||||||
|
|
||||||
|
// @return padding for convolution operation to make sure the output keeps the
// same shape as the given input.
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const Convolution2DAttributes& attr);
|
||||||
|
|
||||||
|
struct ConvolutionTransposedAttributes {
|
||||||
|
HW stride = HW(1, 1); // Along each axis.
|
||||||
|
HW adjacent; // TODO(sorokin): No op on Flow.
|
||||||
|
Padding2D padding;
|
||||||
|
|
||||||
|
Tensor<OHWI, DataType::FLOAT32> weights;
|
||||||
|
Tensor<Linear, DataType::FLOAT32> bias; // optional
|
||||||
|
};
|
||||||
|
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr);
|
||||||
|
|
||||||
|
// @return shape of a tensor after ConvolutionTransposed operation is applied to
|
||||||
|
// the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const ConvolutionTransposedAttributes& attr);
|
||||||
|
|
||||||
|
struct DepthwiseConvolution2DAttributes : public Convolution2DAttributes {};
|
||||||
|
|
||||||
|
// @return shape of a tensor after DepthwiseConvolution2D operation is applied
|
||||||
|
// to the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const DepthwiseConvolution2DAttributes& attr);
|
||||||
|
|
||||||
|
// @return padding for depthwise convolution operation to make sure the output
// keeps the same shape as the given input.
|
||||||
|
Padding2D CalculateSamePadding(const BHWC& input,
|
||||||
|
const DepthwiseConvolution2DAttributes& attr);
|
||||||
|
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const DepthwiseConvolution2DAttributes& attr);
|
||||||
|
|
||||||
|
// f(x):= {
|
||||||
|
// if x < 0 : x -> alpha * x
|
||||||
|
// if x >= 0 : x -> min(clip, x)
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// Examples:
|
||||||
|
// - ReLU: clip = 0, alpha = 0
|
||||||
|
// - ReLU6: clip = 6, alpha = 0
|
||||||
|
// - Leaky ReLU: clip = 0, alpha = a
|
||||||
|
struct ReLUAttributes {
|
||||||
|
// clip <= 0 means it is not set.
|
||||||
|
float clip = 0;
|
||||||
|
|
||||||
|
float alpha = 0;
|
||||||
|
};
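A short sketch of the parameterizations listed in the comment above (illustrative only):

ReLUAttributes relu;   // plain ReLU: clip = 0 (unset), alpha = 0
ReLUAttributes relu6;  // ReLU6: f(x) = min(6, max(0, x))
relu6.clip = 6;
ReLUAttributes leaky;  // Leaky ReLU: f(x) = x >= 0 ? x : 0.1f * x
leaky.alpha = 0.1f;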
|
||||||
|
|
||||||
|
struct PReLUAttributes {
|
||||||
|
// clip <= 0 means it is not set.
|
||||||
|
float clip = 0;
|
||||||
|
|
||||||
|
// If alpha is linear, then it is sharded across CHANNELS axis, otherwise
|
||||||
|
// full shape alpha is required.
|
||||||
|
absl::variant<Tensor<Linear, DataType::FLOAT32>,
|
||||||
|
Tensor<HWC, DataType::FLOAT32>>
|
||||||
|
alpha;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SoftMaxAttributes {
|
||||||
|
Axis axis = Axis::UNKNOWN;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum LstmKernelType {
|
||||||
|
FULL = 0,
|
||||||
|
BASIC = 1, // Currently, only basic is supported.
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LstmAttributes {
|
||||||
|
LstmKernelType kernel_type = LstmKernelType::BASIC;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MultiplyScalarAttributes {
|
||||||
|
absl::variant<absl::monostate, Tensor<Linear, DataType::FLOAT32>, float>
|
||||||
|
param;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class UpsamplingType {
|
||||||
|
NEAREST = 0,
|
||||||
|
BILINEAR = 1,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Upsample2DAttributes {
|
||||||
|
HW new_shape;
|
||||||
|
|
||||||
|
UpsamplingType type = UpsamplingType::NEAREST;
|
||||||
|
|
||||||
|
// If true, the centers of the 4 corner pixels of the input and output tensors
|
||||||
|
// are aligned, preserving the values at the corner pixels. Defaults to false.
|
||||||
|
bool align_corners = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
float CalculateResizeScale(int32_t input_size, int32_t output_size,
|
||||||
|
const Upsample2DAttributes& attr);
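For instance (a sketch with made-up sizes), upscaling one axis from 4 to 8:

Upsample2DAttributes up;
up.new_shape = HW(8, 8);
up.align_corners = true;
float scale = CalculateResizeScale(/*input_size=*/4, /*output_size=*/8, up);
// scale == 3.0f / 7.0f (~0.43); with align_corners = false it would be 4.0f / 8.0f = 0.5f.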
|
||||||
|
|
||||||
|
// @return shape of a tensor after upscale operation is applied to the given
|
||||||
|
// input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const Upsample2DAttributes& attr);
|
||||||
|
|
||||||
|
enum class PaddingContentType {
|
||||||
|
ZEROS = 0,
|
||||||
|
REFLECT = 1,
|
||||||
|
EDGE = 2,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PadAttributes {
|
||||||
|
PaddingContentType type = PaddingContentType::ZEROS;
|
||||||
|
|
||||||
|
HWC prepended;
|
||||||
|
HWC appended;
|
||||||
|
};
|
||||||
|
|
||||||
|
// @return shape of a tensor after Pad operation is applied to the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const PadAttributes& attr);
|
||||||
|
|
||||||
|
struct ConstTensorAttributes {
|
||||||
|
Tensor<BHWC, DataType::FLOAT32> tensor;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Simple slicing without advanced support for shrinking, reverse slicing etc.
|
||||||
|
struct SliceAttributes {
|
||||||
|
// Specifies start and end dimensions for slicing.
|
||||||
|
HWC starts;
|
||||||
|
HWC ends;
|
||||||
|
|
||||||
|
// Stride should be >= 1.
|
||||||
|
HWC strides;
|
||||||
|
};
|
||||||
|
|
||||||
|
// @return shape of a tensor after Slice2D operation is applied to the given
|
||||||
|
// input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input, const SliceAttributes& attr);
|
||||||
|
|
||||||
|
struct AddAttributes {
|
||||||
|
absl::variant<absl::monostate, Tensor<Linear, DataType::FLOAT32>, float>
|
||||||
|
param;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FullyConnectedAttributes {
|
||||||
|
Tensor<OHWI, DataType::FLOAT32> weights;
|
||||||
|
Tensor<Linear, DataType::FLOAT32> bias;
|
||||||
|
};
|
||||||
|
|
||||||
|
// @return shape of a tensor after FullyConnected operation is applied to
|
||||||
|
// the given input.
|
||||||
|
BHWC CalculateOutputShape(const BHWC& input,
|
||||||
|
const FullyConnectedAttributes& attr);
|
||||||
|
|
||||||
|
struct ReshapeAttributes {
|
||||||
|
BHWC new_shape;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_OPERATIONS_H_
|
115
tensorflow/lite/delegates/gpu/common/shape.cc
Normal file
115
tensorflow/lite/delegates/gpu/common/shape.cc
Normal file
@ -0,0 +1,115 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/common/shape.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"

namespace tflite {
namespace gpu {
namespace {

struct GetAxisByIndexFunc {
  template <Layout T>
  Axis operator()() const {
    return GetAxis<T>(index);
  }
  int32_t index;
};

struct GetIndexByAxisFunc {
  template <Layout T>
  int operator()() const {
    return GetAxisIndex<T>(axis);
  }
  Axis axis;
};

struct NumAxisFunc {
  template <Layout T>
  int operator()() const {
    return Size<T>();
  }
};

}  // namespace

std::string ToString(Axis axis) {
  switch (axis) {
    case Axis::BATCH:
      return "batch";
    case Axis::CHANNELS:
      return "channels";
    case Axis::INPUT_CHANNELS:
      return "input_channels";
    case Axis::OUTPUT_CHANNELS:
      return "output_channels";
    case Axis::HEIGHT:
      return "height";
    case Axis::WIDTH:
      return "width";
    case Axis::VALUE:
      return "value";
    case Axis::UNKNOWN:
      return "unknown";
  }
  return "undefined";
}

std::string ToString(Layout layout) {
  switch (layout) {
    case Layout::SCALAR:
      return "scalar";
    case Layout::LINEAR:
      return "linear";
    case Layout::HW:
      return "hw";
    case Layout::CHW:
      return "chw";
    case Layout::HWC:
      return "hwc";
    case Layout::OHWI:
      return "ohwi";
    case Layout::IHWO:
      return "ihwo";
    case Layout::OIHW:
      return "oihw";
    case Layout::IOHW:
      return "iohw";
    case Layout::BHWC:
      return "bhwc";
    case Layout::UNKNOWN:
      return "unknown";
  }
  return "undefined";
}

Axis GetAxis(Layout layout, int32_t index) {
  return DispatchByLayout(layout, GetAxisByIndexFunc{index});
}

int GetAxisIndex(Layout layout, Axis axis) {
  return DispatchByLayout(layout, GetIndexByAxisFunc{axis});
}

int Size(Layout layout) { return DispatchByLayout(layout, NumAxisFunc()); }

std::string ToString(const Shape& s) {
  return absl::StrCat("{", ToString(s.layout), ", {",
                      absl::StrJoin(s.dimensions, ", "), "}}");
}

}  // namespace gpu
}  // namespace tflite
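The three functors above all follow the same dispatch pattern: any functor with a templated operator() over Layout can be applied to a runtime Layout value. A hypothetical sketch of another such functor (not part of this file; it would live in the same namespace):

struct FirstAxisNameFunc {
  template <Layout T>
  std::string operator()() const {
    // Name of the first axis of the layout, or "none" for empty layouts.
    return Size<T>() > 0 ? ToString(GetAxis<T>(0)) : "none";
  }
};

// std::string name = DispatchByLayout(Layout::BHWC, FirstAxisNameFunc());  // "batch"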
612
tensorflow/lite/delegates/gpu/common/shape.h
Normal file
612
tensorflow/lite/delegates/gpu/common/shape.h
Normal file
@ -0,0 +1,612 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_SHAPE_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_SHAPE_H_
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <array>
|
||||||
|
#include <functional>
|
||||||
|
#include <numeric>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
|
||||||
|
enum class Axis {
|
||||||
|
UNKNOWN = 0,
|
||||||
|
CHANNELS = 1,
|
||||||
|
INPUT_CHANNELS = 2,
|
||||||
|
OUTPUT_CHANNELS = 3,
|
||||||
|
HEIGHT = 4,
|
||||||
|
WIDTH = 5,
|
||||||
|
BATCH = 6,
|
||||||
|
VALUE = 7,
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string ToString(Axis t);
|
||||||
|
|
||||||
|
// Layout represents axis order.
|
||||||
|
enum class Layout {
|
||||||
|
UNKNOWN = 0,
|
||||||
|
SCALAR = 1,
|
||||||
|
LINEAR = 2,
|
||||||
|
HW = 3,
|
||||||
|
CHW = 4,
|
||||||
|
HWC = 5,
|
||||||
|
OIHW = 6,
|
||||||
|
OHWI = 7,
|
||||||
|
IHWO = 8,
|
||||||
|
IOHW = 9,
|
||||||
|
BHWC = 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string ToString(Layout l);
|
||||||
|
|
||||||
|
// Returns the number of axes for the fixed layout.
|
||||||
|
template <Layout T>
|
||||||
|
constexpr int Size();
|
||||||
|
|
||||||
|
// Returns the number of axes for the given layout.
|
||||||
|
int Size(Layout layout);
|
||||||
|
|
||||||
|
// Returns Axis for the given index and fixed layout.
|
||||||
|
template <Layout T>
|
||||||
|
constexpr Axis GetAxis(int index);
|
||||||
|
|
||||||
|
// Returns axis for the given layout and index.
|
||||||
|
Axis GetAxis(Layout layout, int32_t index);
|
||||||
|
|
||||||
|
// Returns axis index for the given axis and fixed layout.
|
||||||
|
template <Layout T>
|
||||||
|
constexpr int GetAxisIndex(Axis axis);
|
||||||
|
|
||||||
|
// Returns axis index for the given layout and axis.
|
||||||
|
int GetAxisIndex(Layout layout, Axis axis);
|
||||||
|
|
||||||
|
// Stores Layout(axis set and order) and value for dimensions.
|
||||||
|
struct Shape {
|
||||||
|
Shape() : layout(Layout::UNKNOWN), dimensions() {}
|
||||||
|
|
||||||
|
explicit Shape(Layout t) : layout(t), dimensions(Size(t)) {}
|
||||||
|
|
||||||
|
Shape(Layout t, std::vector<int32_t> d)
|
||||||
|
: layout(t), dimensions(std::move(d)) {}
|
||||||
|
|
||||||
|
bool operator==(const Shape& other) const {
|
||||||
|
return (layout == other.layout) && (dimensions == other.dimensions);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator!=(const Shape& other) const { return !operator==(other); }
|
||||||
|
|
||||||
|
// All methods below are matching same methods defined in StrongShape to
|
||||||
|
// make sure generic algorithms work both ways.
|
||||||
|
|
||||||
|
// Returns a dimension or -1 if it is not found.
|
||||||
|
template <Axis D>
|
||||||
|
int32_t get() const;
|
||||||
|
int32_t get(Axis d) const;
|
||||||
|
|
||||||
|
template <Axis D>
|
||||||
|
bool set(int32_t t);
|
||||||
|
bool set(Axis d, int32_t t);
|
||||||
|
|
||||||
|
Axis axis(int index) const { return GetAxis(layout, index); }
|
||||||
|
|
||||||
|
int index(Axis d) const { return GetAxisIndex(layout, d); }
|
||||||
|
|
||||||
|
int64_t DimensionsProduct() const {
|
||||||
|
return std::accumulate(dimensions.begin(), dimensions.end(), 1ll,
|
||||||
|
std::multiplies<int64_t>());
|
||||||
|
}
|
||||||
|
|
||||||
|
Layout layout = Layout::UNKNOWN;
|
||||||
|
|
||||||
|
std::vector<int32_t> dimensions;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string ToString(const Shape& s);
|
||||||
|
|
||||||
|
// StrongShape provides convenient explicit access to dimensions stored in
|
||||||
|
// shape, e.g. StrongShape<Layout::HW> s; provides s.h and s.w accessors.
|
||||||
|
//
|
||||||
|
// There is a conversion possible both ways between Shape and StrongShape.
|
||||||
|
//
|
||||||
|
// OIHW oihw; // specific shape
|
||||||
|
// Shape l = oihw.ToShape();
|
||||||
|
//
|
||||||
|
// OHWI other; // notice not the same but compatible shape.
|
||||||
|
// if (!other.Adopt(l)) {
|
||||||
|
// // error handling
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// StrongShape supports the following set of operations:
|
||||||
|
//
|
||||||
|
// // Returns number of axis in the shape class.
|
||||||
|
// static constexpr int size();
|
||||||
|
//
|
||||||
|
// // Returns Axis for the given index or Axis::UNKNOWN if index
|
||||||
|
// // falls outside of the defined range in this shape.
|
||||||
|
// static constexpr Axis axis(int index);
|
||||||
|
//
|
||||||
|
// // Returns index for the given axis or -1 if axis is not defined in this
|
||||||
|
// // shape.
|
||||||
|
// static constexpr int index(Axis d);
|
||||||
|
//
|
||||||
|
// // Getters
|
||||||
|
// int32_t get(int index) const;
|
||||||
|
// int32_t get(Axis d) const;
|
||||||
|
// int32_t get<Axis>() const;
|
||||||
|
//
|
||||||
|
// // Setters that return false if set was not successful.
|
||||||
|
// bool set(int index, int32_t v);
|
||||||
|
// bool set(Axis d, int32_t v);
|
||||||
|
// bool set<Axis>(int32_t v);
|
||||||
|
//
|
||||||
|
// // Returns shape's layout.
|
||||||
|
// static const Layout layout;
|
||||||
|
//
|
||||||
|
// // Turns specific shape into generic shape.
|
||||||
|
// Shape ToShape() const;
|
||||||
|
//
|
||||||
|
// // Copies all dimensions from the given shape.
|
||||||
|
// bool Adopt(const Shape&);
|
||||||
|
//
|
||||||
|
template <Layout L>
|
||||||
|
struct StrongShape;
|
||||||
|
|
||||||
|
using Scalar = StrongShape<Layout::SCALAR>;
|
||||||
|
using Linear = StrongShape<Layout::LINEAR>;
|
||||||
|
using HW = StrongShape<Layout::HW>;
|
||||||
|
|
||||||
|
// Common tensor shape for CNN models working with images.
|
||||||
|
using CHW = StrongShape<Layout::CHW>;
|
||||||
|
using HWC = StrongShape<Layout::HWC>;
|
||||||
|
using BHWC = StrongShape<Layout::BHWC>;
|
||||||
|
|
||||||
|
// Tensor shape used in convolution_2d weights.
|
||||||
|
using OIHW = StrongShape<Layout::OIHW>;
|
||||||
|
using OHWI = StrongShape<Layout::OHWI>;
|
||||||
|
using IHWO = StrongShape<Layout::IHWO>;
|
||||||
|
using IOHW = StrongShape<Layout::IOHW>;
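A minimal usage sketch of these aliases (not part of the header; the values are illustrative):

BHWC shape(1, 8, 8, 3);                        // b = 1, h = 8, w = 8, c = 3
int64_t elements = shape.DimensionsProduct();  // 192
int64_t offset = shape.LinearIndex({0, 2, 3, 1});  // ((0 * 8 + 2) * 8 + 3) * 3 + 1 == 58

HWC hwc;
hwc.Adopt(shape.ToShape());                    // copies h, w, c; the batch axis is dropped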
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
// Everything below are internal implementation details.
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
namespace internal_shape {
|
||||||
|
|
||||||
|
template <Axis T>
|
||||||
|
struct AxisTraits;
|
||||||
|
|
||||||
|
#define TFLITE_GPU_AXIS_TRAITS(AxisName, HolderName) \
|
||||||
|
template <> \
|
||||||
|
struct AxisTraits<Axis::AxisName> { \
|
||||||
|
struct Holder { \
|
||||||
|
int32_t HolderName; \
|
||||||
|
\
|
||||||
|
protected: \
|
||||||
|
int32_t operator()() const { return HolderName; } \
|
||||||
|
void operator()(int32_t v) { HolderName = v; } \
|
||||||
|
}; \
|
||||||
|
\
|
||||||
|
using dimension_holder_type = Holder; \
|
||||||
|
}
|
||||||
|
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(CHANNELS, c);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(HEIGHT, h);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(WIDTH, w);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(INPUT_CHANNELS, i);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(OUTPUT_CHANNELS, o);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(BATCH, b);
|
||||||
|
TFLITE_GPU_AXIS_TRAITS(VALUE, v);
|
||||||
|
|
||||||
|
#undef TFLITE_GPU_AXIS_TRAITS
|
||||||
|
|
||||||
|
template <int N, Axis... As>
|
||||||
|
struct StrongShapeImpl;
|
||||||
|
|
||||||
|
template <int N>
|
||||||
|
struct StrongShapeImpl<N> {
|
||||||
|
static constexpr int size() { return N; }
|
||||||
|
|
||||||
|
static constexpr Axis axis(int) { return Axis::UNKNOWN; }
|
||||||
|
|
||||||
|
static constexpr int index(Axis) { return -1; }
|
||||||
|
|
||||||
|
int32_t get(Axis) const { return -1; }
|
||||||
|
|
||||||
|
int32_t get(int) const { return -1; }
|
||||||
|
|
||||||
|
template <Axis B>
|
||||||
|
int32_t get() const {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool set(Axis, int32_t) { return false; }
|
||||||
|
|
||||||
|
bool set(int, int32_t) { return false; }
|
||||||
|
|
||||||
|
template <Axis B>
|
||||||
|
bool set(int32_t) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Used to deduce the number of axes, and to be a child of a proper holder to
// provide access to the dimension by name.
|
||||||
|
template <int N, Axis A, Axis... As>
|
||||||
|
struct StrongShapeImpl<N, A, As...>
|
||||||
|
: public AxisTraits<A>::dimension_holder_type,
|
||||||
|
public StrongShapeImpl<N + 1, As...> {
|
||||||
|
using dimension_holder_type = typename AxisTraits<A>::dimension_holder_type;
|
||||||
|
|
||||||
|
using rest_type = StrongShapeImpl<N + 1, As...>;
|
||||||
|
|
||||||
|
StrongShapeImpl() : dimension_holder_type{0}, rest_type() {}
|
||||||
|
|
||||||
|
template <typename... Ts>
|
||||||
|
explicit StrongShapeImpl(int32_t t, Ts... ts)
|
||||||
|
: dimension_holder_type{t}, rest_type(ts...) {}
|
||||||
|
|
||||||
|
static constexpr Axis axis(int index) {
|
||||||
|
return index == N ? A : rest_type::axis(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr int index(Axis d) {
|
||||||
|
return d == A ? N : rest_type::index(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t get(Axis d) const {
|
||||||
|
return d == A ? dimension_holder_type::operator()() : rest_type::get(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis B>
|
||||||
|
int32_t get() const {
|
||||||
|
return B == A ? dimension_holder_type::operator()()
|
||||||
|
: rest_type::template get<B>();
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t get(int index) const {
|
||||||
|
return index == N ? dimension_holder_type::operator()()
|
||||||
|
: rest_type::get(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool set(Axis d, int32_t t) {
|
||||||
|
if (d == A) {
|
||||||
|
dimension_holder_type::operator()(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return rest_type::set(d, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool set(int index, int32_t t) {
|
||||||
|
if (index == N) {
|
||||||
|
dimension_holder_type::operator()(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return rest_type::set(index, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis B>
|
||||||
|
bool set(int32_t t) {
|
||||||
|
if (A == B) {
|
||||||
|
dimension_holder_type::operator()(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return rest_type::template set<B>(t);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <Layout T>
|
||||||
|
struct LayoutTraits;
|
||||||
|
|
||||||
|
#define TFLITE_GPU_LAYOUT_TRAITS(LayoutName, ...) \
|
||||||
|
template <> \
|
||||||
|
struct LayoutTraits<Layout::LayoutName> { \
|
||||||
|
using strong_shape_type = StrongShapeImpl<0, __VA_ARGS__>; \
|
||||||
|
}
|
||||||
|
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(HW, Axis::HEIGHT, Axis::WIDTH);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(OHWI, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH,
|
||||||
|
Axis::INPUT_CHANNELS);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(OIHW, Axis::OUTPUT_CHANNELS, Axis::INPUT_CHANNELS,
|
||||||
|
Axis::HEIGHT, Axis::WIDTH);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(IOHW, Axis::INPUT_CHANNELS, Axis::OUTPUT_CHANNELS,
|
||||||
|
Axis::HEIGHT, Axis::WIDTH);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(IHWO, Axis::INPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH,
|
||||||
|
Axis::OUTPUT_CHANNELS);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(CHW, Axis::CHANNELS, Axis::HEIGHT, Axis::WIDTH);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(HWC, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(LINEAR, Axis::VALUE);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(SCALAR, Axis::VALUE);
|
||||||
|
TFLITE_GPU_LAYOUT_TRAITS(BHWC, Axis::BATCH, Axis::HEIGHT, Axis::WIDTH,
|
||||||
|
Axis::CHANNELS);
|
||||||
|
|
||||||
|
#undef TFLITE_GPU_LAYOUT_TRAITS
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct LayoutTraits<Layout::UNKNOWN> {
|
||||||
|
using strong_shape_type = StrongShapeImpl<0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <Axis A>
|
||||||
|
struct DimensionGetterFixedAxisFunc {
|
||||||
|
template <Layout T>
|
||||||
|
int32_t operator()() const {
|
||||||
|
constexpr int i = GetAxisIndex<T>(A);
|
||||||
|
return i >= 0 && i < l->dimensions.size() ? l->dimensions[i] : -1;
|
||||||
|
}
|
||||||
|
const Shape* l;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DimensionGetterFunc {
|
||||||
|
template <Layout T>
|
||||||
|
int32_t operator()() const {
|
||||||
|
int i = GetAxisIndex<T>(d);
|
||||||
|
return i >= 0 && i < l->dimensions.size() ? l->dimensions[i] : -1;
|
||||||
|
}
|
||||||
|
Axis d;
|
||||||
|
const Shape* l;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <Axis A>
|
||||||
|
struct DimensionSetterFixedAxisFunc {
|
||||||
|
template <Layout T>
|
||||||
|
bool operator()() const {
|
||||||
|
constexpr int i = GetAxisIndex<T>(A);
|
||||||
|
if (i >= 0 && i < l->dimensions.size()) {
|
||||||
|
l->dimensions[i] = v;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
Shape* l;
|
||||||
|
int32_t v;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DimensionSetterFunc {
|
||||||
|
template <Layout T>
|
||||||
|
bool operator()() const {
|
||||||
|
int i = GetAxisIndex<T>(d);
|
||||||
|
if (i >= 0 && i < l->dimensions.size()) {
|
||||||
|
l->dimensions[i] = v;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
Axis d;
|
||||||
|
Shape* l;
|
||||||
|
int32_t v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <Layout L>
|
||||||
|
struct ToShapeFunc {
|
||||||
|
template <Layout T>
|
||||||
|
bool operator()() const {
|
||||||
|
for (int i = 0; i < StrongShape<L>::size(); ++i) {
|
||||||
|
int index = GetAxisIndex<T>(StrongShape<L>::axis(i));
|
||||||
|
if (index < 0) return false;
|
||||||
|
shape->set(i, l.dimensions[index]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
StrongShape<L>* shape;
|
||||||
|
const Shape& l;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal_shape
|
||||||
|
|
||||||
|
// template <Axis... As>
|
||||||
|
template <Layout L>
|
||||||
|
struct StrongShape : public internal_shape::LayoutTraits<L>::strong_shape_type {
|
||||||
|
using strong_shape_type =
|
||||||
|
typename internal_shape::LayoutTraits<L>::strong_shape_type;
|
||||||
|
StrongShape() = default;
|
||||||
|
|
||||||
|
template <typename... Ts>
|
||||||
|
explicit StrongShape(Ts... t) : strong_shape_type(t...) {}
|
||||||
|
|
||||||
|
constexpr static Layout layout = L;
|
||||||
|
|
||||||
|
bool operator==(const StrongShape<L>& shape) const {
|
||||||
|
// TODO(akulik): implement better alternative.
|
||||||
|
return this->ToShape() == shape.ToShape();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator!=(const StrongShape<L>& shape) const {
|
||||||
|
// TODO(akulik): implement better alternative.
|
||||||
|
return this->ToShape() != shape.ToShape();
|
||||||
|
}
|
||||||
|
bool empty() const { return DimensionsProduct() == 0; }
|
||||||
|
|
||||||
|
// Turns StrongShape into generic shape.
|
||||||
|
Shape ToShape() const {
|
||||||
|
std::vector<int32_t> dimensions(StrongShape::size());
|
||||||
|
for (int i = 0; i < StrongShape::size(); ++i) {
|
||||||
|
dimensions[i] = StrongShape::get(i);
|
||||||
|
}
|
||||||
|
return Shape(L, std::move(dimensions));
|
||||||
|
}
|
||||||
|
|
||||||
|
// @return all dimensions multiplied
|
||||||
|
int64_t DimensionsProduct() const {
|
||||||
|
int64_t product = 1;
|
||||||
|
for (int i = 0; i < StrongShape::size(); ++i) {
|
||||||
|
product *= StrongShape::get(i);
|
||||||
|
}
|
||||||
|
return product;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translates given coordinates of the layout into a linear index assuming
|
||||||
|
// dimensions are sorted in tensor access order e.g. if you access
|
||||||
|
// foobar[i][j][k] order of coordinates should be i,j,k.
|
||||||
|
int64_t LinearIndex(
|
||||||
|
const std::array<int32_t, StrongShape::size()>& coordinates) const {
|
||||||
|
int64_t index = coordinates[0];
|
||||||
|
for (int i = 1; i < StrongShape::size(); ++i) {
|
||||||
|
index = index * StrongShape::get(i) + coordinates[i];
|
||||||
|
}
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies all dimensions from the given generic shape into specific shape.
|
||||||
|
// It requires shape to have all axis defined in the given
|
||||||
|
// StrongShape. For example:
|
||||||
|
// - If this shape is OHWI but given shape is OIHW, Adopt will copy all
|
||||||
|
// dimensions and return true.
|
||||||
|
// - If this shape is OIHW but input shape is HW, Adopt will copy H and W
|
||||||
|
// dimensions and return true, but if this shape is HW and given shape
|
||||||
|
// OIHW, then Adopt will return false because not all axis are present in
|
||||||
|
// the input shape.
|
||||||
|
//
|
||||||
|
// @return false if generic shape is not compatible.
|
||||||
|
bool Adopt(const Shape& shape) {
|
||||||
|
return DispatchByLayout(shape.layout,
|
||||||
|
internal_shape::ToShapeFunc<L>{this, shape});
|
||||||
|
}
|
||||||
|
|
||||||
|
// For all axis defined in a given shape copies values to this shape.
|
||||||
|
// Therefore, it is possible to copy dimensions from CHW to BCHW, but not
|
||||||
|
// the other way around.
|
||||||
|
//
|
||||||
|
// BCHW bchw;
|
||||||
|
// CHW chw;
|
||||||
|
// bchw.CopyAllGivenAxis(chw); --> true
|
||||||
|
// chw.CopyAllGivenAxis(bchw); --> false
|
||||||
|
//
|
||||||
|
// @return false if axis in source shape is not defined here, thus value
|
||||||
|
// was not copied.
|
||||||
|
template <Layout B>
|
||||||
|
bool CopyAllGivenAxis(const StrongShape<B>& source) {
|
||||||
|
for (int i = 0; i < source.size(); ++i) {
|
||||||
|
if (!StrongShape::set(source.axis(i), source.get(i))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For all axis defined in this shape copies values from the given shape.
|
||||||
|
//
|
||||||
|
// BCHW bchw;
|
||||||
|
// CHW chw;
|
||||||
|
// bchw.CopyAllDefinedAxis(chw); --> false
|
||||||
|
// chw.CopyAllDefinedAxis(bchw); --> true
|
||||||
|
//
|
||||||
|
// @return false if given shape does not have axis defined here,
|
||||||
|
// therefore a value was not copied.
|
||||||
|
template <Layout B>
|
||||||
|
bool CopyAllDefinedAxis(const StrongShape<B>& source) {
|
||||||
|
for (int i = 0; i < StrongShape::size(); ++i) {
|
||||||
|
int source_index = source.index(StrongShape::axis(i));
|
||||||
|
if (source_index < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
StrongShape::set(i, source.get(source_index)); // always true
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies values only for matching axis.
|
||||||
|
template <Layout B>
|
||||||
|
void CopyMatchingAxis(const StrongShape<B>& source) {
|
||||||
|
for (int i = 0; i < StrongShape::size(); ++i) {
|
||||||
|
StrongShape::set(source.axis(i), source.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <Layout T>
|
||||||
|
inline std::string ToString(const StrongShape<T>& s) {
|
||||||
|
return ToString(s.ToShape());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Layout L>
|
||||||
|
constexpr Layout StrongShape<L>::layout;
|
||||||
|
|
||||||
|
template <class F>
|
||||||
|
auto DispatchByLayout(Layout type, F f)
|
||||||
|
-> decltype(f.template operator()<Layout::UNKNOWN>()) {
|
||||||
|
switch (type) {
|
||||||
|
case Layout::HW:
|
||||||
|
return f.template operator()<Layout::HW>();
|
||||||
|
case Layout::HWC:
|
||||||
|
return f.template operator()<Layout::HWC>();
|
||||||
|
case Layout::CHW:
|
||||||
|
return f.template operator()<Layout::CHW>();
|
||||||
|
case Layout::OIHW:
|
||||||
|
return f.template operator()<Layout::OIHW>();
|
||||||
|
case Layout::IOHW:
|
||||||
|
return f.template operator()<Layout::IOHW>();
|
||||||
|
case Layout::OHWI:
|
||||||
|
return f.template operator()<Layout::OHWI>();
|
||||||
|
case Layout::IHWO:
|
||||||
|
return f.template operator()<Layout::IHWO>();
|
||||||
|
case Layout::LINEAR:
|
||||||
|
return f.template operator()<Layout::LINEAR>();
|
||||||
|
case Layout::SCALAR:
|
||||||
|
return f.template operator()<Layout::SCALAR>();
|
||||||
|
case Layout::BHWC:
|
||||||
|
return f.template operator()<Layout::BHWC>();
|
||||||
|
case Layout::UNKNOWN:
|
||||||
|
return f.template operator()<Layout::UNKNOWN>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Layout T>
|
||||||
|
constexpr int Size() {
|
||||||
|
return StrongShape<T>::size();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Layout T>
|
||||||
|
constexpr Axis GetAxis(int index) {
|
||||||
|
return StrongShape<T>::axis(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Layout T>
|
||||||
|
constexpr int GetAxisIndex(Axis axis) {
|
||||||
|
return StrongShape<T>::index(axis);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis D>
|
||||||
|
inline int32_t Shape::get() const {
|
||||||
|
return DispatchByLayout(
|
||||||
|
layout, internal_shape::DimensionGetterFixedAxisFunc<D>{this});
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int32_t Shape::get(Axis d) const {
|
||||||
|
return DispatchByLayout(layout, internal_shape::DimensionGetterFunc{d, this});
|
||||||
|
}
|
||||||
|
|
||||||
|
template <Axis D>
|
||||||
|
inline bool Shape::set(int32_t t) {
|
||||||
|
return DispatchByLayout(
|
||||||
|
layout, internal_shape::DimensionSetterFixedAxisFunc<D>{this, t});
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool Shape::set(Axis d, int32_t t) {
|
||||||
|
return DispatchByLayout(layout,
|
||||||
|
internal_shape::DimensionSetterFunc{d, this, t});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_SHAPE_H_
|
123
tensorflow/lite/delegates/gpu/common/shape_test.cc
Normal file
123
tensorflow/lite/delegates/gpu/common/shape_test.cc
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||||
|
|
||||||
|
#include <initializer_list>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <gmock/gmock.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
TEST(OIHW, Smoke) {
|
||||||
|
OIHW OIHW;
|
||||||
|
|
||||||
|
// Test 4 different versions of setters.
|
||||||
|
OIHW.i = 1;
|
||||||
|
ASSERT_TRUE(OIHW.set<Axis::OUTPUT_CHANNELS>(2));
|
||||||
|
ASSERT_TRUE(OIHW.set(Axis::HEIGHT, 3));
|
||||||
|
ASSERT_TRUE(OIHW.set(3, 4));
|
||||||
|
|
||||||
|
// Make sure invalid setters return false.
|
||||||
|
ASSERT_FALSE(OIHW.set(5, 10));
|
||||||
|
ASSERT_FALSE(OIHW.set(Axis::CHANNELS, 10));
|
||||||
|
ASSERT_FALSE(OIHW.set<Axis::CHANNELS>(10));
|
||||||
|
|
||||||
|
// Test 4 different versions of getters
|
||||||
|
EXPECT_EQ(1, OIHW.get(Axis::INPUT_CHANNELS));
|
||||||
|
EXPECT_EQ(2, OIHW.o);
|
||||||
|
EXPECT_EQ(3, OIHW.get(2));
|
||||||
|
EXPECT_EQ(4, OIHW.get<Axis::WIDTH>());
|
||||||
|
|
||||||
|
// Make sure getters that fall outside of a range return invalid axis.
|
||||||
|
EXPECT_EQ(-1, OIHW.get(5));
|
||||||
|
EXPECT_EQ(-1, OIHW.get(Axis::CHANNELS));
|
||||||
|
EXPECT_EQ(-1, OIHW.get<Axis::CHANNELS>());
|
||||||
|
|
||||||
|
// Check axis indices are all correct.
|
||||||
|
ASSERT_EQ(4, OIHW.size());
|
||||||
|
std::vector<Axis> expected = {Axis::OUTPUT_CHANNELS, Axis::INPUT_CHANNELS,
|
||||||
|
Axis::HEIGHT, Axis::WIDTH};
|
||||||
|
for (int i = 0; i < OIHW.size(); ++i) {
|
||||||
|
Axis axis = OIHW.axis(i);
|
||||||
|
ASSERT_EQ(expected[i], axis);
|
||||||
|
ASSERT_EQ(i, OIHW.index(axis));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check equivalent conversions.
|
||||||
|
OHWI ohwi;
|
||||||
|
ASSERT_TRUE(ohwi.CopyAllDefinedAxis(OIHW));
|
||||||
|
EXPECT_EQ(ohwi.o, OIHW.o);
|
||||||
|
EXPECT_EQ(ohwi.i, OIHW.i);
|
||||||
|
EXPECT_EQ(ohwi.h, OIHW.h);
|
||||||
|
EXPECT_EQ(ohwi.w, OIHW.w);
|
||||||
|
|
||||||
|
ohwi = OHWI(10, 20, 30, 40);
|
||||||
|
ASSERT_TRUE(OIHW.CopyAllGivenAxis(ohwi));
|
||||||
|
EXPECT_EQ(ohwi.o, OIHW.o);
|
||||||
|
EXPECT_EQ(ohwi.i, OIHW.i);
|
||||||
|
EXPECT_EQ(ohwi.h, OIHW.h);
|
||||||
|
EXPECT_EQ(ohwi.w, OIHW.w);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Layout, Smoke) {
|
||||||
|
EXPECT_EQ(4, Size<Layout::OIHW>());
|
||||||
|
EXPECT_EQ(4, Size(Layout::OIHW));
|
||||||
|
std::vector<Axis> expected = {Axis::OUTPUT_CHANNELS, Axis::INPUT_CHANNELS,
|
||||||
|
Axis::HEIGHT, Axis::WIDTH};
|
||||||
|
for (int i = 0; i < Size<Layout::OIHW>(); ++i) {
|
||||||
|
Axis axis = GetAxis<Layout::OIHW>(i);
|
||||||
|
ASSERT_EQ(expected[i], axis);
|
||||||
|
ASSERT_EQ(axis, GetAxis(Layout::OIHW, i));
|
||||||
|
ASSERT_EQ(i, GetAxisIndex<Layout::OIHW>(axis));
|
||||||
|
ASSERT_EQ(i, GetAxisIndex(Layout::OIHW, axis));
|
||||||
|
}
|
||||||
|
EXPECT_EQ(Axis::UNKNOWN, GetAxis(Layout::OIHW, 5));
|
||||||
|
EXPECT_EQ(-1, GetAxisIndex<Layout::OIHW>(Axis::CHANNELS));
|
||||||
|
EXPECT_EQ(-1, GetAxisIndex<Layout::OIHW>(Axis::CHANNELS));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Shape, Smoke) {
|
||||||
|
Shape s(Layout::OIHW, {1, 2, 3, 4});
|
||||||
|
EXPECT_TRUE(s.set(Axis::HEIGHT, 10));
|
||||||
|
EXPECT_TRUE(s.set<Axis::WIDTH>(20));
|
||||||
|
EXPECT_FALSE(s.set(Axis::BATCH, 10));
|
||||||
|
EXPECT_FALSE(s.set<Axis::BATCH>(20));
|
||||||
|
|
||||||
|
ASSERT_EQ(10, s.get<Axis::HEIGHT>());
|
||||||
|
ASSERT_EQ(20, s.get(Axis::WIDTH));
|
||||||
|
EXPECT_EQ(20, s.dimensions[3]);
|
||||||
|
|
||||||
|
OIHW oihw(1, 2, 10, 20);
|
||||||
|
Shape s2 = oihw.ToShape();
|
||||||
|
EXPECT_EQ(s2.layout, oihw.layout);
|
||||||
|
EXPECT_EQ(s.layout, s2.layout);
|
||||||
|
EXPECT_EQ(s.dimensions, s2.dimensions);
|
||||||
|
|
||||||
|
// Convert layout into compatible shape.
|
||||||
|
OHWI ohwi;
|
||||||
|
ASSERT_TRUE(ohwi.Adopt(s2));
|
||||||
|
EXPECT_EQ(1, ohwi.o);
|
||||||
|
EXPECT_EQ(2, ohwi.i);
|
||||||
|
EXPECT_EQ(10, ohwi.h);
|
||||||
|
EXPECT_EQ(20, ohwi.w);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
124
tensorflow/lite/delegates/gpu/common/status.h
Normal file
124
tensorflow/lite/delegates/gpu/common/status.h
Normal file
@ -0,0 +1,124 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_STATUS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_STATUS_H_

#include <string>

namespace tflite {
namespace gpu {

enum class StatusCode {
  kOk = 0,
  kCancelled = 1,
  kUnknown = 2,
  kInvalidArgument = 3,
  kDeadlineExceeded = 4,
  kNotFound = 5,
  kAlreadyExists = 6,
  kPermissionDenied = 7,
  kResourceExhausted = 8,
  kFailedPrecondition = 9,
  kAborted = 10,
  kOutOfRange = 11,
  kUnimplemented = 12,
  kInternal = 13,
  kUnavailable = 14,
  kDataLoss = 15,
  kUnauthenticated = 16,
  kDoNotUseReservedForFutureExpansionUseDefaultInSwitchInstead_ = 20
};

// Lite version of Status without dependency on protobuf.
// TODO(b/128867901): Migrate to absl::Status.
class Status {
 public:
  Status() = default;
  Status(StatusCode code) : code_(code) {}
  Status(StatusCode code, const std::string& error_message)
      : code_(code), error_message_(error_message) {}

  const std::string& error_message() const { return error_message_; }
  StatusCode code() const { return code_; }
  bool ok() const { return code_ == StatusCode::kOk; }

  void IgnoreError() const {}

 private:
  StatusCode code_ = StatusCode::kOk;
  std::string error_message_;
};

#define RETURN_IF_ERROR(status)         \
  {                                     \
    const auto status2 = (status);      \
    if (!status2.ok()) return status2;  \
  }

inline Status OkStatus() { return Status(); }

inline Status AlreadyExistsError(const std::string& message) {
  return Status(StatusCode::kAlreadyExists, message);
}

inline Status DeadlineExceededError(const std::string& message) {
  return Status(StatusCode::kDeadlineExceeded, message);
}

inline Status FailedPreconditionError(const std::string& message) {
  return Status(StatusCode::kFailedPrecondition, message);
}

inline Status InternalError(const std::string& message) {
  return Status(StatusCode::kInternal, message);
}

inline Status InvalidArgumentError(const std::string& message) {
  return Status(StatusCode::kInvalidArgument, message);
}

inline Status NotFoundError(const std::string& message) {
  return Status(StatusCode::kNotFound, message);
}

inline Status OutOfRangeError(const std::string& message) {
  return Status(StatusCode::kOutOfRange, message);
}

inline Status PermissionDeniedError(const std::string& message) {
  return Status(StatusCode::kPermissionDenied, message);
}

inline Status ResourceExhaustedError(const std::string& message) {
  return Status(StatusCode::kResourceExhausted, message);
}

inline Status UnavailableError(const std::string& message) {
  return Status(StatusCode::kUnavailable, message);
}

inline Status UnimplementedError(const std::string& message) {
  return Status(StatusCode::kUnimplemented, message);
}

inline Status UnknownError(const std::string& message) {
  return Status(StatusCode::kUnknown, message);
}

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_STATUS_H_
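A small usage sketch for the status helpers above (the function names here are hypothetical and only illustrate the calling convention):

Status CheckBatchSize(int batch) {
  if (batch != 1) {
    return InvalidArgumentError("Only batch size 1 is currently supported.");
  }
  return OkStatus();
}

Status Prepare(int input_batch, int output_batch) {
  RETURN_IF_ERROR(CheckBatchSize(input_batch));
  RETURN_IF_ERROR(CheckBatchSize(output_batch));
  return OkStatus();
}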
94
tensorflow/lite/delegates/gpu/common/tensor.h
Normal file
94
tensorflow/lite/delegates/gpu/common/tensor.h
Normal file
@ -0,0 +1,94 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TENSOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TENSOR_H_

#include <string>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"

namespace tflite {
namespace gpu {
namespace internal_tensor {

// Meta-function that maps an element type to the container type used for
// Tensor data.
template <DataType Type>
struct StorageType;

template <>
struct StorageType<DataType::FLOAT32> {
  using value = std::vector<float>;
};

template <>
struct StorageType<DataType::INT32> {
  using value = std::vector<int32_t>;
};

}  // namespace internal_tensor

template <typename ShapeT, DataType Type>
struct Tensor {
  using ShapeType = ShapeT;

  constexpr static DataType kType = Type;

  using TensorStorageType = typename internal_tensor::StorageType<Type>::value;

  // Opaque id of a tensor.
  int64_t id = -1;

  ShapeType shape;

  TensorStorageType data;
};

// TensorRef is a reference to another tensor. If an object should never hold
// tensor data, then TensorRef should be used instead.
template <typename ShapeT>
struct TensorRef {
  using ShapeType = ShapeT;

  DataType type = DataType::UNKNOWN;

  ShapeT shape;

  // Opaque reference to a tensor. Upstream component is responsible for
  // resolving this reference into an actual tensor.
  int64_t ref = -1;
};

template <typename ShapeT, DataType Type>
constexpr DataType Tensor<ShapeT, Type>::kType;

template <typename ShapeT, DataType Type>
Tensor<ShapeT, Type> MakeZeroTensor(const ShapeT& shape) {
  Tensor<ShapeT, Type> tensor;
  tensor.shape = shape;
  tensor.data = typename Tensor<ShapeT, Type>::TensorStorageType(
      shape.DimensionsProduct(), 0);
  return tensor;
}

using TensorFloat32 = Tensor<BHWC, DataType::FLOAT32>;
using TensorRefFloat32 = TensorRef<BHWC>;

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TENSOR_H_
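A brief sketch (not part of the commit) of how these containers compose; the shape values are arbitrary examples chosen for illustration.

// Illustrative only: build a zero-initialized float tensor and a reference to it.
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {

void TensorExample() {
  // 1 batch, 4x4 spatial, 8 channels; data holds 1*4*4*8 zeros.
  TensorFloat32 t = MakeZeroTensor<BHWC, DataType::FLOAT32>(BHWC(1, 4, 4, 8));
  t.id = 0;

  // A TensorRef only describes the tensor; it never owns data.
  TensorRefFloat32 ref;
  ref.type = DataType::FLOAT32;
  ref.shape = t.shape;
  ref.ref = t.id;
}

}  // namespace gpu
}  // namespace tflite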
203
tensorflow/lite/delegates/gpu/common/transformations/BUILD
Normal file
@ -0,0 +1,203 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])  # Apache 2.0

cc_library(
    name = "add_bias",
    srcs = ["add_bias.cc"],
    hdrs = ["add_bias.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:any",
    ],
)

cc_library(
    name = "fuse_add_to_conv",
    srcs = ["fuse_add_to_conv.cc"],
    hdrs = ["fuse_add_to_conv.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/strings",
    ],
)

cc_test(
    name = "fuse_add_to_conv_test",
    srcs = ["fuse_add_to_conv_test.cc"],
    deps = [
        ":fuse_add_to_conv",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "fuse_mul_to_conv",
    srcs = ["fuse_mul_to_conv.cc"],
    hdrs = ["fuse_mul_to_conv.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
    ],
)

cc_test(
    name = "fuse_mul_to_conv_test",
    srcs = ["fuse_mul_to_conv_test.cc"],
    deps = [
        ":fuse_mul_to_conv",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "general_transformations",
    srcs = ["general_transformations.cc"],
    hdrs = ["general_transformations.h"],
    deps = [
        ":fuse_add_to_conv",
        ":fuse_mul_to_conv",
        ":make_fully_connected",
        ":make_padding",
        ":merge_padding_with",
        ":remove_noop",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
    ],
)

cc_library(
    name = "make_fully_connected",
    srcs = ["make_fully_connected.cc"],
    hdrs = ["make_fully_connected.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:any",
    ],
)

cc_test(
    name = "make_fully_connected_test",
    srcs = ["make_fully_connected_test.cc"],
    deps = [
        ":make_fully_connected",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "@com_google_absl//absl/types:any",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "make_padding",
    srcs = ["make_padding.cc"],
    hdrs = ["make_padding.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:any",
    ],
)

cc_test(
    name = "make_padding_test",
    srcs = ["make_padding_test.cc"],
    deps = [
        ":make_padding",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "@com_google_absl//absl/types:any",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "matching",
    hdrs = ["matching.h"],
    deps = ["//tensorflow/lite/delegates/gpu/common:model"],
)

cc_library(
    name = "merge_padding_with",
    srcs = ["merge_padding_with.cc"],
    hdrs = ["merge_padding_with.h"],
    deps = [
        ":matching",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:any",
    ],
)

cc_test(
    name = "merge_padding_with_test",
    srcs = ["merge_padding_with_test.cc"],
    deps = [
        ":merge_padding_with",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "@com_google_absl//absl/types:any",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "remove_noop",
    srcs = ["remove_noop.cc"],
    hdrs = ["remove_noop.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
    ],
)

cc_test(
    name = "remove_noop_test",
    srcs = ["remove_noop_test.cc"],
    deps = [
        ":remove_noop",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "@com_google_googletest//:gtest_main",
    ],
)
@ -0,0 +1,74 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"

#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace {

template <typename T>
TransformResult FillBias(Node* node) {
  auto& attr = absl::any_cast<T&>(node->operation.attributes);
  if (attr.bias.data.empty()) {
    const int dst_channels = attr.weights.shape.o;
    attr.bias = MakeZeroTensor<Linear, DataType::FLOAT32>(Linear(dst_channels));
    return {TransformStatus::APPLIED, "Added bias"};
  }
  return {TransformStatus::SKIPPED, ""};
}

template TransformResult FillBias<Convolution2DAttributes>(Node* node);
template TransformResult FillBias<ConvolutionTransposedAttributes>(Node* node);
template TransformResult FillBias<DepthwiseConvolution2DAttributes>(Node* node);
template TransformResult FillBias<FullyConnectedAttributes>(Node* node);

class AddBias : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final {
    if (node->operation.type == ToString(OperationType::CONVOLUTION_2D)) {
      return FillBias<Convolution2DAttributes>(node);
    }
    if (node->operation.type ==
        ToString(OperationType::CONVOLUTION_TRANSPOSED)) {
      return FillBias<ConvolutionTransposedAttributes>(node);
    }
    if (node->operation.type ==
        ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
      return FillBias<DepthwiseConvolution2DAttributes>(node);
    }
    if (node->operation.type == ToString(OperationType::FULLY_CONNECTED)) {
      return FillBias<FullyConnectedAttributes>(node);
    }
    return {TransformStatus::SKIPPED, ""};
  }
};

}  // namespace

std::unique_ptr<NodeTransformation> NewAddBias() {
  return absl::make_unique<AddBias>();
}

}  // namespace gpu
}  // namespace tflite
@ -0,0 +1,32 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_ADD_BIAS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_ADD_BIAS_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

// Makes the optional bias of Conv/Deconv (and similar ops) non-optional, i.e.
// always present.
std::unique_ptr<NodeTransformation> NewAddBias();

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_ADD_BIAS_H_
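For context, a minimal sketch (not part of the commit) of running this transformation over a graph; it follows the same ModelTransformer::Apply pattern used in the tests later in this change, the pass name is arbitrary, and the nullptr second argument simply mirrors the tests.

// Illustrative only: ensure every conv-like node in `graph` has a bias tensor.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"

namespace tflite {
namespace gpu {

void EnsureBiases(GraphFloat32* graph) {
  auto transformation = NewAddBias();
  ModelTransformer transformer(graph, nullptr);  // second argument as in the tests
  transformer.Apply("add_bias", transformation.get());
}

}  // namespace gpu
}  // namespace tflite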
@ -0,0 +1,235 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h"

#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace {

void FuseBiasWithAddAttributes(const AddAttributes& add_attr,
                               const int channels,
                               Tensor<Linear, DataType::FLOAT32>* bias) {
  auto add = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param);
  auto add_scalar = absl::get_if<float>(&add_attr.param);
  if (bias->data.empty()) {
    *bias = MakeZeroTensor<Linear, DataType::FLOAT32>(Linear(channels));
  }
  for (int d = 0; d < channels; ++d) {
    bias->data[d] += add ? add->data[d] : *add_scalar;
  }
}

class MergeConvolutionWithAdd : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    auto& conv_node = *sequence[0];
    auto& add_node = *sequence[1];
    if (add_node.operation.type != ToString(OperationType::ADD)) {
      return {TransformStatus::SKIPPED, ""};
    }
    AddAttributes add_attr =
        absl::any_cast<AddAttributes>(add_node.operation.attributes);
    if (!absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param) &&
        !absl::get_if<float>(&add_attr.param)) {
      return {TransformStatus::DECLINED,
              "This fuse applicable only for broadcast or scalar addition."};
    }

    if (conv_node.operation.type == ToString(OperationType::CONVOLUTION_2D)) {
      Convolution2DAttributes* conv_attr =
          absl::any_cast<Convolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseConvolution2DWithAdd(add_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::CONVOLUTION_TRANSPOSED)) {
      ConvolutionTransposedAttributes* conv_attr =
          absl::any_cast<ConvolutionTransposedAttributes>(
              &conv_node.operation.attributes);
      FuseConvolutionTransposedWithAdd(add_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
      DepthwiseConvolution2DAttributes* conv_attr =
          absl::any_cast<DepthwiseConvolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseDepthwiseConvolution2DWithAdd(add_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::FULLY_CONNECTED)) {
      FullyConnectedAttributes* conv_attr =
          absl::any_cast<FullyConnectedAttributes>(
              &conv_node.operation.attributes);
      FuseFullyConnectedWithAdd(add_attr, conv_attr);
    } else {
      return {TransformStatus::SKIPPED, ""};
    }

    Status status = RemoveFollowingNode(graph, &add_node, &conv_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
              "Unable to remove add node after convolution: " +
                  status.error_message()};
    }
    return {TransformStatus::APPLIED, ""};
  }
};

class MergeAddWithConvolution : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    auto& conv_node = *sequence[1];
    auto& add_node = *sequence[0];
    if (add_node.operation.type != ToString(OperationType::ADD)) {
      return {TransformStatus::SKIPPED, ""};
    }
    AddAttributes add_attr =
        absl::any_cast<AddAttributes>(add_node.operation.attributes);
    if (!absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param) &&
        !absl::get_if<float>(&add_attr.param)) {
      return {TransformStatus::DECLINED,
              "This fuse applicable only for broadcast or scalar addition."};
    }

    if (conv_node.operation.type == ToString(OperationType::CONVOLUTION_2D)) {
      Convolution2DAttributes* conv_attr =
          absl::any_cast<Convolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseAddWithConvolution2D(add_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
      DepthwiseConvolution2DAttributes* conv_attr =
          absl::any_cast<DepthwiseConvolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseAddWithDepthwiseConvolution2D(add_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::FULLY_CONNECTED)) {
      FullyConnectedAttributes* conv_attr =
          absl::any_cast<FullyConnectedAttributes>(
              &conv_node.operation.attributes);
      FuseAddWithFullyConnected(add_attr, conv_attr);
    } else {
      return {TransformStatus::SKIPPED, ""};
    }

    Status status = RemovePrecedingNode(graph, &add_node, &conv_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
"Unable to remove add node after convolution: " +
                  status.error_message()};
    }
    return {TransformStatus::APPLIED, ""};
  }
};
}  // namespace

std::unique_ptr<SequenceTransformation> NewMergeConvolutionWithAdd() {
  return absl::make_unique<MergeConvolutionWithAdd>();
}

std::unique_ptr<SequenceTransformation> NewMergeAddWithConvolution() {
  return absl::make_unique<MergeAddWithConvolution>();
}

void FuseConvolution2DWithAdd(const AddAttributes& add_attr,
                              Convolution2DAttributes* attr) {
  FuseBiasWithAddAttributes(add_attr, attr->weights.shape.o, &attr->bias);
}

void FuseDepthwiseConvolution2DWithAdd(const AddAttributes& add_attr,
                                       DepthwiseConvolution2DAttributes* attr) {
  FuseBiasWithAddAttributes(
      add_attr, attr->weights.shape.o * attr->weights.shape.i, &attr->bias);
}

void FuseConvolutionTransposedWithAdd(const AddAttributes& add_attr,
                                      ConvolutionTransposedAttributes* attr) {
  FuseBiasWithAddAttributes(add_attr, attr->weights.shape.o, &attr->bias);
}

void FuseFullyConnectedWithAdd(const AddAttributes& add_attr,
                               FullyConnectedAttributes* attr) {
  FuseBiasWithAddAttributes(add_attr, attr->weights.shape.o, &attr->bias);
}

void FuseAddWithConvolution2D(const AddAttributes& add_attr,
                              Convolution2DAttributes* attr) {
  auto add = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param);
  auto add_scalar = absl::get_if<float>(&add_attr.param);
  if (attr->bias.data.empty()) {
    attr->bias = MakeZeroTensor<Linear, DataType::FLOAT32>(
        Linear(attr->weights.shape.o));
  }
  for (int d = 0; d < attr->weights.shape.o; ++d) {
    for (int s = 0; s < attr->weights.shape.i; ++s) {
      const float add_value = add ? add->data[s] : *add_scalar;
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({d, k_y, k_x, s});
          attr->bias.data[d] += attr->weights.data[index] * add_value;
        }
      }
    }
  }
}

void FuseAddWithDepthwiseConvolution2D(const AddAttributes& add_attr,
                                       DepthwiseConvolution2DAttributes* attr) {
  auto add = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param);
  auto add_scalar = absl::get_if<float>(&add_attr.param);
  if (attr->bias.data.empty()) {
    attr->bias = MakeZeroTensor<Linear, DataType::FLOAT32>(
        Linear(attr->weights.shape.o * attr->weights.shape.i));
  }
  for (int s = 0; s < attr->weights.shape.i; ++s) {
    const float add_value = add ? add->data[s] : *add_scalar;
    for (int g = 0; g < attr->weights.shape.o; ++g) {
      const int d = s * attr->weights.shape.o + g;
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({g, k_y, k_x, s});
          attr->bias.data[d] += attr->weights.data[index] * add_value;
        }
      }
    }
  }
}

void FuseAddWithFullyConnected(const AddAttributes& add_attr,
                               FullyConnectedAttributes* attr) {
  auto add = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param);
  auto add_scalar = absl::get_if<float>(&add_attr.param);
  if (attr->bias.data.empty()) {
    attr->bias = MakeZeroTensor<Linear, DataType::FLOAT32>(
        Linear(attr->weights.shape.o));
  }
  for (int d = 0; d < attr->weights.shape.o; ++d) {
    for (int s = 0; s < attr->weights.shape.i; ++s) {
      const float add_value = add ? add->data[s] : *add_scalar;
      const int index = attr->weights.shape.LinearIndex({d, 0, 0, s});
      attr->bias.data[d] += attr->weights.data[index] * add_value;
    }
  }
}

}  // namespace gpu
}  // namespace tflite
@ -0,0 +1,83 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_ADD_TO_CONV_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_ADD_TO_CONV_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {

// Fuse Add Scalar or Add Broadcast after Convolution(Convolution2D,
// DepthWise, TransposedConvolution, FullyConnected) into biases of
// convolution.
std::unique_ptr<SequenceTransformation> NewMergeConvolutionWithAdd();

// Fuse Add Scalar or Add Broadcast before Convolution(Convolution2D,
// DepthWise, FullyConnected) into biases of
// convolution.
std::unique_ptr<SequenceTransformation> NewMergeAddWithConvolution();

// Modify Convolution2DAttributes so that after making convolution with
// modified attributes we will have the same result as convolution
// with old attributes and following add operation.
void FuseConvolution2DWithAdd(const AddAttributes& add_attr,
                              Convolution2DAttributes* attr);

// Modify DepthwiseConvolution2DAttributes so that after making depth wise
// convolution with modified attributes we will have the same result as depth
// wise convolution with old attributes and following add operation.
void FuseDepthwiseConvolution2DWithAdd(const AddAttributes& add_attr,
                                       DepthwiseConvolution2DAttributes* attr);

// Modify ConvolutionTransposedAttributes so that after making convolution
// transposed with modified attributes we will have the same result as
// convolution transposed with old attributes and following add operation.
void FuseConvolutionTransposedWithAdd(const AddAttributes& add_attr,
                                      ConvolutionTransposedAttributes* attr);

// Modify FullyConnectedAttributes so that after making fully connected with
// modified attributes we will have the same result as fully connected
// with old attributes and following add operation.
void FuseFullyConnectedWithAdd(const AddAttributes& add_attr,
                               FullyConnectedAttributes* attr);

// Modify Convolution2DAttributes so that after making convolution with
// modified attributes we will have the same result as add operation and
// convolution with old attributes
void FuseAddWithConvolution2D(const AddAttributes& add_attr,
                              Convolution2DAttributes* attr);

// Modify DepthwiseConvolution2DAttributes so that after making depth wise
// convolution with modified attributes we will have the same result as add
// operation and depth wise convolution with old attributes
void FuseAddWithDepthwiseConvolution2D(const AddAttributes& add_attr,
                                       DepthwiseConvolution2DAttributes* attr);

// Modify FullyConnectedAttributes so that after making fully connected
// with modified attributes we will have the same result as add operation and
// fully connected with old attributes
void FuseAddWithFullyConnected(const AddAttributes& add_attr,
                               FullyConnectedAttributes* attr);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_ADD_TO_CONV_H_
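A short sketch (not part of the commit) of wiring both fuse directions into a graph pass; the function name is hypothetical, the pass names are arbitrary, and the ModelTransformer usage mirrors the tests that follow.

// Illustrative only: fold ADD nodes around conv-like nodes into their biases.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h"

namespace tflite {
namespace gpu {

void FuseAdds(GraphFloat32* graph) {
  ModelTransformer transformer(graph, nullptr);  // second argument as in the tests
  auto conv_then_add = NewMergeConvolutionWithAdd();
  transformer.Apply("merge_convolution_with_add", conv_then_add.get());
  auto add_then_conv = NewMergeAddWithConvolution();
  transformer.Apply("merge_add_with_convolution", add_then_conv.get());
}

}  // namespace gpu
}  // namespace tflite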
@ -0,0 +1,281 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"

using ::testing::FloatNear;
using ::testing::Pointwise;

namespace tflite {
namespace gpu {
namespace {

TEST(MergeConvolutionWithAddTest, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 4, 4, 8);

  Convolution2DAttributes conv_attr;
  conv_attr.padding.prepended = HW(0, 0);
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.strides = HW(1, 1);
  conv_attr.dilations = HW(1, 1);
  conv_attr.weights.shape = OHWI(16, 3, 2, 8);
  conv_attr.weights.data.resize(conv_attr.weights.shape.DimensionsProduct());
  conv_attr.bias.shape = Linear(16);
  conv_attr.bias.data.resize(16);

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(16);
  add_tensor.data.resize(16);
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  auto conv_node = graph.NewNode();
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv_node->operation.attributes = conv_attr;
  auto add_node = graph.NewNode();
  add_node->operation.type = ToString(OperationType::ADD);
  add_node->operation.attributes = add_attr;

  ASSERT_TRUE(graph.AddConsumer(conv_node->id, input->id).ok());

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok());
  output->tensor.shape = BHWC(1, 4, 4, 16);

  Value<TensorRefFloat32>* link1;
  ASSERT_TRUE(ConnectTwoNodes(&graph, conv_node, add_node, &link1).ok());
  link1->tensor.shape = BHWC(1, 4, 4, 16);

  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewMergeConvolutionWithAdd();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_convolution_with_add", transformation.get());

  EXPECT_EQ(1, graph.nodes().size());
  EXPECT_EQ(2, graph.values().size());
  EXPECT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[0]->operation.type);
}

TEST(MergeAddWithConvolutionTest, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 4, 4, 8);

  Convolution2DAttributes conv_attr;
  conv_attr.padding.prepended = HW(0, 0);
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.strides = HW(1, 1);
  conv_attr.dilations = HW(1, 1);
  conv_attr.weights.shape = OHWI(16, 3, 2, 8);
  conv_attr.weights.data.resize(conv_attr.weights.shape.DimensionsProduct());
  conv_attr.bias.shape = Linear(16);
  conv_attr.bias.data.resize(16);

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(8);
  add_tensor.data.resize(8);
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  auto conv_node = graph.NewNode();
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv_node->operation.attributes = conv_attr;
  auto add_node = graph.NewNode();
  add_node->operation.type = ToString(OperationType::ADD);
  add_node->operation.attributes = add_attr;

  ASSERT_TRUE(graph.AddConsumer(add_node->id, input->id).ok());

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, conv_node, &output).ok());
  output->tensor.shape = BHWC(1, 4, 4, 16);

  Value<TensorRefFloat32>* link1;
  ASSERT_TRUE(ConnectTwoNodes(&graph, add_node, conv_node, &link1).ok());
  link1->tensor.shape = BHWC(1, 4, 4, 16);

  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewMergeAddWithConvolution();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_add_with_convolution", transformation.get());

  EXPECT_EQ(1, graph.nodes().size());
  EXPECT_EQ(2, graph.values().size());
  EXPECT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[0]->operation.type);
}

TEST(FuseAddAfterConvolution2DTest, Smoke) {
  Convolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.1f, 1.2f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(2);
  add_tensor.data = {0.3f, 0.7f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseConvolution2DWithAdd(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.4f, 1.9f}));
}

TEST(FuseAddAfterDepthwiseConvolution2DTest, Smoke) {
  DepthwiseConvolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(4);
  attr.bias.data = {1.1f, 1.2f, 1.3f, 1.4f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(4);
  add_tensor.data = {0.3f, 0.7f, 0.5f, 0.1f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseDepthwiseConvolution2DWithAdd(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}));
  EXPECT_THAT(attr.bias.data,
              Pointwise(FloatNear(1e-6), {1.4f, 1.9f, 1.8f, 1.5f}));
}

TEST(FuseAddAfterConvolutionTransposedTest, Smoke) {
  ConvolutionTransposedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.1f, 1.2f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(2);
  add_tensor.data = {0.3f, 0.7f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseConvolutionTransposedWithAdd(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.4f, 1.9f}));
}

TEST(FuseAddAfterFullyConnectedTest, Smoke) {
  FullyConnectedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 1, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.1f, 1.2f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(2);
  add_tensor.data = {0.3f, 0.7f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseFullyConnectedWithAdd(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6), {0.1f, 0.2f, 0.3f, 0.4f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.4f, 1.9f}));
}

TEST(FuseAddBeforeConvolution2DTest, Smoke) {
  Convolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.1f, 1.2f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(2);
  add_tensor.data = {2.0f, 0.5f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseAddWithConvolution2D(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {2.2f, 4.3f}));
}

TEST(FuseAddBeforeDepthwiseConvolution2DTest, Smoke) {
  DepthwiseConvolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(4);
  attr.bias.data = {1.1f, 1.2f, 1.3f, 1.4f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(4);
  add_tensor.data = {0.3f, 0.7f, 0.5f, 0.1f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseAddWithDepthwiseConvolution2D(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}));
  EXPECT_THAT(attr.bias.data,
              Pointwise(FloatNear(1e-6), {1.22f, 1.56f, 1.72f, 2.38f}));
}

TEST(FuseAddBeforeFullyConnectedTest, Smoke) {
  FullyConnectedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 1, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.1f, 1.2f};

  Tensor<Linear, DataType::FLOAT32> add_tensor;
  add_tensor.shape = Linear(2);
  add_tensor.data = {0.5f, 2.0f};
  AddAttributes add_attr;
  add_attr.param = add_tensor;

  FuseAddWithFullyConnected(add_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6), {0.1f, 0.2f, 0.3f, 0.4f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.55f, 2.15f}));
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
@ -0,0 +1,304 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h"

#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {
namespace {

class MergeConvolutionWithMul : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    auto& conv_node = *sequence[0];
    auto& mul_node = *sequence[1];
    if (mul_node.operation.type != ToString(OperationType::MUL) &&
        mul_node.operation.type != ToString(OperationType::MULTIPLY_SCALAR)) {
      return {TransformStatus::SKIPPED, ""};
    }

    MultiplyScalarAttributes mul_attr =
        absl::any_cast<MultiplyScalarAttributes>(mul_node.operation.attributes);
    if (!absl::get_if<Tensor<Linear, DataType::FLOAT32>>(
            &mul_attr.param) &&
        !absl::get_if<float>(&mul_attr.param)) {
      return {
          TransformStatus::DECLINED,
          "This fuse applicable only for broadcast or scalar multiplication."};
    }

    if (conv_node.operation.type == ToString(OperationType::CONVOLUTION_2D)) {
      Convolution2DAttributes* conv_attr =
          absl::any_cast<Convolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseConvolution2DWithMultiply(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::CONVOLUTION_TRANSPOSED)) {
      ConvolutionTransposedAttributes* conv_attr =
          absl::any_cast<ConvolutionTransposedAttributes>(
              &conv_node.operation.attributes);
      FuseConvolutionTransposedWithMultiply(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
      DepthwiseConvolution2DAttributes* conv_attr =
          absl::any_cast<DepthwiseConvolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseDepthwiseConvolution2DWithMultiply(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::FULLY_CONNECTED)) {
      FullyConnectedAttributes* conv_attr =
          absl::any_cast<FullyConnectedAttributes>(
              &conv_node.operation.attributes);
      FuseFullyConnectedWithMultiply(mul_attr, conv_attr);
    } else {
      return {TransformStatus::SKIPPED, ""};
    }

    Status status = RemoveFollowingNode(graph, &mul_node, &conv_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
              "Unable to remove mul node after convolution: " +
                  status.error_message()};
    }
    return {TransformStatus::APPLIED, ""};
  }
};

class MergeMulWithConvolution : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    auto& conv_node = *sequence[1];
    auto& mul_node = *sequence[0];
    if (mul_node.operation.type != ToString(OperationType::MUL) &&
        mul_node.operation.type != ToString(OperationType::MULTIPLY_SCALAR)) {
      return {TransformStatus::SKIPPED, ""};
    }

    MultiplyScalarAttributes mul_attr =
        absl::any_cast<MultiplyScalarAttributes>(mul_node.operation.attributes);
    if (!absl::get_if<Tensor<Linear, DataType::FLOAT32>>(
            &mul_attr.param) &&
        !absl::get_if<float>(&mul_attr.param)) {
      return {
          TransformStatus::DECLINED,
          "This fuse applicable only for broadcast or scalar multiplication."};
    }

    if (conv_node.operation.type == ToString(OperationType::CONVOLUTION_2D)) {
      Convolution2DAttributes* conv_attr =
          absl::any_cast<Convolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseMultiplyWithConvolution2D(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::CONVOLUTION_TRANSPOSED)) {
      ConvolutionTransposedAttributes* conv_attr =
          absl::any_cast<ConvolutionTransposedAttributes>(
              &conv_node.operation.attributes);
      FuseMultiplyWithConvolutionTransposed(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
      DepthwiseConvolution2DAttributes* conv_attr =
          absl::any_cast<DepthwiseConvolution2DAttributes>(
              &conv_node.operation.attributes);
      FuseMultiplyWithDepthwiseConvolution2D(mul_attr, conv_attr);
    } else if (conv_node.operation.type ==
               ToString(OperationType::FULLY_CONNECTED)) {
      FullyConnectedAttributes* conv_attr =
          absl::any_cast<FullyConnectedAttributes>(
              &conv_node.operation.attributes);
      FuseMultiplyWithFullyConnected(mul_attr, conv_attr);
    } else {
      return {TransformStatus::SKIPPED, ""};
    }

    Status status = RemovePrecedingNode(graph, &mul_node, &conv_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
"Unable to remove mul node after convolution: " +
|
||||||
|
status.error_message()};
|
||||||
|
}
|
||||||
|
return {TransformStatus::APPLIED, ""};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
std::unique_ptr<SequenceTransformation> NewMergeConvolutionWithMul() {
|
||||||
|
return absl::make_unique<MergeConvolutionWithMul>();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<SequenceTransformation> NewMergeMulWithConvolution() {
|
||||||
|
return absl::make_unique<MergeMulWithConvolution>();
|
||||||
|
}
|
||||||
|
|
||||||
|
void FuseConvolution2DWithMultiply(const MultiplyScalarAttributes& mul_attr,
|
||||||
|
Convolution2DAttributes* attr) {
|
||||||
|
auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
|
||||||
|
auto mul_scalar = absl::get_if<float>(&mul_attr.param);
|
||||||
|
for (int d = 0; d < attr->weights.shape.o; ++d) {
|
||||||
|
const float multiplier = mul ? mul->data[d] : *mul_scalar;
|
||||||
|
for (int s = 0; s < attr->weights.shape.i; ++s) {
|
||||||
|
for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
|
||||||
|
for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
|
||||||
|
const int index = attr->weights.shape.LinearIndex({d, k_y, k_x, s});
|
||||||
|
attr->weights.data[index] *= multiplier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!attr->bias.data.empty()) {
|
||||||
|
attr->bias.data[d] *= multiplier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void FuseDepthwiseConvolution2DWithMultiply(
|
||||||
|
const MultiplyScalarAttributes& mul_attr,
|
||||||
|
DepthwiseConvolution2DAttributes* attr) {
|
||||||
|
auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
|
||||||
|
auto mul_scalar = absl::get_if<float>(&mul_attr.param);
|
||||||
|
for (int g = 0; g < attr->weights.shape.o; ++g) {
|
||||||
|
for (int s = 0; s < attr->weights.shape.i; ++s) {
|
||||||
|
const int d = s * attr->weights.shape.o + g;
|
||||||
|
const float multiplier = mul ? mul->data[d] : *mul_scalar;
|
||||||
|
for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
|
||||||
|
for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
|
||||||
|
const int index = attr->weights.shape.LinearIndex({g, k_y, k_x, s});
|
||||||
|
attr->weights.data[index] *= multiplier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!attr->bias.data.empty()) {
|
||||||
|
attr->bias.data[d] *= multiplier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void FuseConvolutionTransposedWithMultiply(
|
||||||
|
const MultiplyScalarAttributes& mul_attr,
|
||||||
|
ConvolutionTransposedAttributes* attr) {
|
||||||
|
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int d = 0; d < attr->weights.shape.o; ++d) {
    const float multiplier = mul ? mul->data[d] : *mul_scalar;
    for (int s = 0; s < attr->weights.shape.i; ++s) {
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({d, k_y, k_x, s});
          attr->weights.data[index] *= multiplier;
        }
      }
    }
    if (!attr->bias.data.empty()) {
      attr->bias.data[d] *= multiplier;
    }
  }
}

void FuseFullyConnectedWithMultiply(const MultiplyScalarAttributes& mul_attr,
                                    FullyConnectedAttributes* attr) {
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int d = 0; d < attr->weights.shape.o; ++d) {
    const float multiplier = mul ? mul->data[d] : *mul_scalar;
    for (int s = 0; s < attr->weights.shape.i; ++s) {
      const int index = attr->weights.shape.LinearIndex({d, 0, 0, s});
      attr->weights.data[index] *= multiplier;
    }
    if (!attr->bias.data.empty()) {
      attr->bias.data[d] *= multiplier;
    }
  }
}

void FuseMultiplyWithConvolution2D(const MultiplyScalarAttributes& mul_attr,
                                   Convolution2DAttributes* attr) {
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int s = 0; s < attr->weights.shape.i; ++s) {
    const float multiplier = mul ? mul->data[s] : *mul_scalar;
    for (int d = 0; d < attr->weights.shape.o; ++d) {
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({d, k_y, k_x, s});
          attr->weights.data[index] *= multiplier;
        }
      }
    }
  }
}

void FuseMultiplyWithDepthwiseConvolution2D(
    const MultiplyScalarAttributes& mul_attr,
    DepthwiseConvolution2DAttributes* attr) {
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int s = 0; s < attr->weights.shape.i; ++s) {
    const float multiplier = mul ? mul->data[s] : *mul_scalar;
    for (int g = 0; g < attr->weights.shape.o; ++g) {
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({g, k_y, k_x, s});
          attr->weights.data[index] *= multiplier;
        }
      }
    }
  }
}

void FuseMultiplyWithConvolutionTransposed(
    const MultiplyScalarAttributes& mul_attr,
    ConvolutionTransposedAttributes* attr) {
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int s = 0; s < attr->weights.shape.i; ++s) {
    const float multiplier = mul ? mul->data[s] : *mul_scalar;
    for (int d = 0; d < attr->weights.shape.o; ++d) {
      for (int k_y = 0; k_y < attr->weights.shape.h; ++k_y) {
        for (int k_x = 0; k_x < attr->weights.shape.w; ++k_x) {
          const int index = attr->weights.shape.LinearIndex({d, k_y, k_x, s});
          attr->weights.data[index] *= multiplier;
        }
      }
    }
  }
}

void FuseMultiplyWithFullyConnected(const MultiplyScalarAttributes& mul_attr,
                                    FullyConnectedAttributes* attr) {
  auto mul = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&mul_attr.param);
  auto mul_scalar = absl::get_if<float>(&mul_attr.param);
  for (int s = 0; s < attr->weights.shape.i; ++s) {
    const float multiplier = mul ? mul->data[s] : *mul_scalar;
    for (int d = 0; d < attr->weights.shape.o; ++d) {
      const int index = attr->weights.shape.LinearIndex({d, 0, 0, s});
      attr->weights.data[index] *= multiplier;
    }
  }
}

}  // namespace gpu
}  // namespace tflite
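The "fuse after" helpers above rely on a simple identity: multiplying a convolution's output channel d by m[d] is the same as multiplying every weight that produces channel d, and its bias, by m[d]. A minimal standalone sketch of that identity for a single output position (plain C++, not part of this commit; the helper name Conv1x1 and the sample numbers are illustrative):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Dot product of one output channel's weights with an input patch, plus bias.
float Conv1x1(const std::vector<float>& w, const std::vector<float>& x,
              float bias) {
  float acc = bias;
  for (std::size_t i = 0; i < w.size(); ++i) acc += w[i] * x[i];
  return acc;
}

int main() {
  const std::vector<float> x = {0.3f, -1.2f};  // input patch (2 channels)
  std::vector<float> w = {0.1f, 0.2f};         // weights of one output channel
  float bias = 1.5f;
  const float m = 0.5f;                        // multiplier for that channel

  // Reference: convolution followed by a per-channel multiply.
  const float reference = Conv1x1(w, x, bias) * m;

  // Fused: the multiply folded into weights and bias, as the transform does.
  for (float& v : w) v *= m;
  bias *= m;
  const float fused = Conv1x1(w, x, bias);

  assert(std::fabs(reference - fused) < 1e-6f);
  return 0;
}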
@@ -0,0 +1,93 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_MUL_TO_CONV_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_MUL_TO_CONV_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {

// Fuses a Multiply Scalar or Multiply Broadcast that follows a convolution
// (Convolution2D, DepthWise, TransposedConvolution, FullyConnected) into the
// convolution's weights and biases.
std::unique_ptr<SequenceTransformation> NewMergeConvolutionWithMul();

// Fuses a Multiply Scalar or Multiply Broadcast that precedes a convolution
// (Convolution2D, DepthWise, TransposedConvolution, FullyConnected) into the
// convolution's weights and biases.
std::unique_ptr<SequenceTransformation> NewMergeMulWithConvolution();

// Modifies Convolution2DAttributes so that the convolution with the modified
// attributes produces the same result as the convolution with the old
// attributes followed by the multiply operation.
void FuseConvolution2DWithMultiply(const MultiplyScalarAttributes& mul_attr,
                                   Convolution2DAttributes* attr);

// Modifies DepthwiseConvolution2DAttributes so that the depthwise convolution
// with the modified attributes produces the same result as the depthwise
// convolution with the old attributes followed by the multiply operation.
void FuseDepthwiseConvolution2DWithMultiply(
    const MultiplyScalarAttributes& mul_attr,
    DepthwiseConvolution2DAttributes* attr);

// Modifies ConvolutionTransposedAttributes so that the transposed convolution
// with the modified attributes produces the same result as the transposed
// convolution with the old attributes followed by the multiply operation.
void FuseConvolutionTransposedWithMultiply(
    const MultiplyScalarAttributes& mul_attr,
    ConvolutionTransposedAttributes* attr);

// Modifies FullyConnectedAttributes so that the fully connected operation with
// the modified attributes produces the same result as the fully connected
// operation with the old attributes followed by the multiply operation.
void FuseFullyConnectedWithMultiply(const MultiplyScalarAttributes& mul_attr,
                                    FullyConnectedAttributes* attr);

// Modifies Convolution2DAttributes so that the convolution with the modified
// attributes produces the same result as the multiply operation followed by
// the convolution with the old attributes.
void FuseMultiplyWithConvolution2D(const MultiplyScalarAttributes& mul_attr,
                                   Convolution2DAttributes* attr);

// Modifies DepthwiseConvolution2DAttributes so that the depthwise convolution
// with the modified attributes produces the same result as the multiply
// operation followed by the depthwise convolution with the old attributes.
void FuseMultiplyWithDepthwiseConvolution2D(
    const MultiplyScalarAttributes& mul_attr,
    DepthwiseConvolution2DAttributes* attr);

// Modifies ConvolutionTransposedAttributes so that the transposed convolution
// with the modified attributes produces the same result as the multiply
// operation followed by the transposed convolution with the old attributes.
void FuseMultiplyWithConvolutionTransposed(
    const MultiplyScalarAttributes& mul_attr,
    ConvolutionTransposedAttributes* attr);

// Modifies FullyConnectedAttributes so that the fully connected operation with
// the modified attributes produces the same result as the multiply operation
// followed by the fully connected operation with the old attributes.
void FuseMultiplyWithFullyConnected(const MultiplyScalarAttributes& mul_attr,
                                    FullyConnectedAttributes* attr);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_FUSE_MUL_TO_CONV_H_
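The two families of helpers above differ only in which axis the multiplier is indexed by: the "WithMultiply" (multiply after conv) variants scale by output channel d and also scale the bias, while the "MultiplyWith" (multiply before conv) variants scale by input channel s and leave the bias alone, since the bias is added after the weight-input products. A small standalone sketch of the "before" case (plain C++, not part of this commit; the sample numbers are illustrative):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  // One output channel, two input channels, 1x1 kernel.
  std::vector<float> w = {0.1f, 0.2f};
  const float bias = 1.5f;
  const std::vector<float> m = {0.5f, 2.0f};   // per-input-channel multiplier
  const std::vector<float> x = {0.7f, -0.4f};  // input patch

  // Reference: multiply the input first, then convolve.
  float reference = bias;
  for (std::size_t s = 0; s < w.size(); ++s) reference += w[s] * (m[s] * x[s]);

  // Fused: scale each weight by the multiplier of its input channel;
  // the bias stays untouched because it is added after the products.
  for (std::size_t s = 0; s < w.size(); ++s) w[s] *= m[s];
  float fused = bias;
  for (std::size_t s = 0; s < w.size(); ++s) fused += w[s] * x[s];

  assert(std::fabs(reference - fused) < 1e-6f);
  return 0;
}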
@@ -0,0 +1,303 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"

using ::testing::FloatNear;
using ::testing::Pointwise;

namespace tflite {
namespace gpu {
namespace {

TEST(MergeConvolutionWithMulTest, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 4, 4, 8);

  Convolution2DAttributes conv_attr;
  conv_attr.padding.prepended = HW(0, 0);
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.strides = HW(1, 1);
  conv_attr.dilations = HW(1, 1);
  conv_attr.weights.shape = OHWI(16, 3, 2, 8);
  conv_attr.weights.data.resize(conv_attr.weights.shape.DimensionsProduct());
  conv_attr.bias.shape = Linear(16);
  conv_attr.bias.data.resize(16);

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(16);
  mul_tensor.data.resize(16);
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  auto conv_node = graph.NewNode();
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv_node->operation.attributes = conv_attr;
  auto mul_node = graph.NewNode();
  mul_node->operation.type = ToString(OperationType::MUL);
  mul_node->operation.attributes = mul_attr;

  ASSERT_TRUE(graph.AddConsumer(conv_node->id, input->id).ok());

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, mul_node, &output).ok());
  output->tensor.shape = BHWC(1, 4, 4, 16);

  Value<TensorRefFloat32>* link1;
  ASSERT_TRUE(ConnectTwoNodes(&graph, conv_node, mul_node, &link1).ok());
  link1->tensor.shape = BHWC(1, 4, 4, 16);

  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewMergeConvolutionWithMul();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_convolution_with_mul", transformation.get());

  EXPECT_EQ(1, graph.nodes().size());
  EXPECT_EQ(2, graph.values().size());
  EXPECT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[0]->operation.type);
}

TEST(MergeMulWithConvolutionTest, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 4, 4, 8);

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(8);
  mul_tensor.data.resize(8);
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  Convolution2DAttributes conv_attr;
  conv_attr.padding.prepended = HW(0, 0);
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.strides = HW(1, 1);
  conv_attr.dilations = HW(1, 1);
  conv_attr.weights.shape = OHWI(16, 3, 2, 8);
  conv_attr.weights.data.resize(conv_attr.weights.shape.DimensionsProduct());
  conv_attr.bias.shape = Linear(16);
  conv_attr.bias.data.resize(16);

  auto conv_node = graph.NewNode();
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv_node->operation.attributes = conv_attr;
  auto mul_node = graph.NewNode();
  mul_node->operation.type = ToString(OperationType::MUL);
  mul_node->operation.attributes = mul_attr;

  ASSERT_TRUE(graph.AddConsumer(mul_node->id, input->id).ok());

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, conv_node, &output).ok());
  output->tensor.shape = BHWC(1, 4, 4, 16);

  Value<TensorRefFloat32>* link1;
  ASSERT_TRUE(ConnectTwoNodes(&graph, mul_node, conv_node, &link1).ok());
  link1->tensor.shape = BHWC(1, 4, 4, 16);

  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewMergeMulWithConvolution();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_mul_with_convolution", transformation.get());

  EXPECT_EQ(1, graph.nodes().size());
  EXPECT_EQ(2, graph.values().size());
  EXPECT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[0]->operation.type);
}

TEST(FuseMulAfterConvolution2DTest, Smoke) {
  Convolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseConvolution2DWithMultiply(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.1f, 0.15f, 0.2f, 1.0f, 1.2f, 1.4f, 1.6f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {0.75f, 5.0f}));
}

TEST(FuseMulAfterDepthwiseConvolution2DTest, Smoke) {
  DepthwiseConvolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(4);
  attr.bias.data = {1.5f, 2.5f, 1.0f, 2.0f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(4);
  mul_tensor.data = {0.5f, 2.0f, 4.0f, 0.25f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseDepthwiseConvolution2DWithMultiply(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.8f, 0.15f, 1.6f, 1.0f, 0.15f, 1.4f, 0.2f}));
  EXPECT_THAT(attr.bias.data,
              Pointwise(FloatNear(1e-6), {0.75f, 5.0f, 4.0f, 0.5f}));
}

TEST(FuseMulAfterConvolutionTransposedTest, Smoke) {
  ConvolutionTransposedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseConvolutionTransposedWithMultiply(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.1f, 0.15f, 0.2f, 1.0f, 1.2f, 1.4f, 1.6f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {0.75f, 5.0f}));
}

TEST(FuseMulAfterFullyConnectedTest, Smoke) {
  FullyConnectedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 1, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseFullyConnectedWithMultiply(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6), {0.05f, 0.1f, 0.6f, 0.8f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {0.75f, 5.0f}));
}

TEST(FuseMulBeforeConvolution2DTest, Smoke) {
  Convolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseMultiplyWithConvolution2D(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.4f, 0.15f, 0.8f, 0.25f, 1.2f, 0.35f, 1.6f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.5f, 2.5f}));
}

TEST(FuseMulBeforeDepthwiseConvolution2DTest, Smoke) {
  DepthwiseConvolution2DAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(4);
  attr.bias.data = {1.5f, 2.5f, 1.0f, 2.0f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(4);
  mul_tensor.data = {0.5f, 2.0f, 4.0f, 0.25f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseMultiplyWithDepthwiseConvolution2D(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.4f, 0.15f, 0.8f, 0.25f, 1.2f, 0.35f, 1.6f}));
  EXPECT_THAT(attr.bias.data,
              Pointwise(FloatNear(1e-6), {1.5f, 2.5f, 1.0f, 2.0f}));
}

TEST(FuseMulBeforeConvolutionTransposedTest, Smoke) {
  ConvolutionTransposedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 2, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseMultiplyWithConvolutionTransposed(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6),
                        {0.05f, 0.4f, 0.15f, 0.8f, 0.25f, 1.2f, 0.35f, 1.6f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.5f, 2.5f}));
}

TEST(FuseMulBeforeFullyConnectedTest, Smoke) {
  FullyConnectedAttributes attr;
  attr.weights.shape = OHWI(2, 1, 1, 2);
  attr.weights.data = {0.1f, 0.2f, 0.3f, 0.4f};
  attr.bias.shape = Linear(2);
  attr.bias.data = {1.5f, 2.5f};

  Tensor<Linear, DataType::FLOAT32> mul_tensor;
  mul_tensor.shape = Linear(2);
  mul_tensor.data = {0.5f, 2.0f};
  MultiplyScalarAttributes mul_attr;
  mul_attr.param = mul_tensor;

  FuseMultiplyWithFullyConnected(mul_attr, &attr);

  EXPECT_THAT(attr.weights.data,
              Pointwise(FloatNear(1e-6), {0.05f, 0.4f, 0.15f, 0.8f}));
  EXPECT_THAT(attr.bias.data, Pointwise(FloatNear(1e-6), {1.5f, 2.5f}));
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,58 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h"

#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/make_padding.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/remove_noop.h"

namespace tflite {
namespace gpu {

bool ApplyGeneralTransformations(ModelTransformer* transformer) {
  // If any of these transforms returns false, the graph is in a broken state
  // and processing should not continue.
  return transformer->Apply("remove_degenerate_upsampling",
                            NewRemoveDegenerateUpsampling().get()) &&
         transformer->Apply("remove_single_input_add",
                            NewRemoveSingleInputAdd().get()) &&
         transformer->Apply("remove_single_input_concat",
                            NewRemoveSingleInputConcat().get()) &&
         transformer->Apply("make_padding_from_concat",
                            NewMakePaddingFromConcat().get()) &&
         transformer->Apply("make_fully_connected_from_convolution",
                            NewMakeFullyConnectedFromConvolution().get()) &&
         transformer->Apply("merge_padding_with_convolution",
                            NewMergePaddingWithConvolution2D().get()) &&
         transformer->Apply("merge_padding_with_pooling",
                            NewMergePaddingWithPooling().get()) &&
         transformer->Apply("merge_padding_with_depthwise_convolution",
                            NewMergePaddingWithDepthwiseConvolution().get()) &&
         transformer->Apply("merge_convolution_with_mul",
                            NewMergeConvolutionWithMul().get()) &&
         transformer->Apply("merge_convolution_with_add",
                            NewMergeConvolutionWithAdd().get()) &&
         transformer->Apply("merge_mul_with_convolution",
                            NewMergeMulWithConvolution().get()) &&
         transformer->Apply("merge_add_with_convolution",
                            NewMergeAddWithConvolution().get());
}

}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,30 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_GENERAL_TRANSFORMATIONS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_GENERAL_TRANSFORMATIONS_H_

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

// @return false when something went wrong and left the graph in a broken
// state.
bool ApplyGeneralTransformations(ModelTransformer* transformer);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_GENERAL_TRANSFORMATIONS_H_
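A minimal usage sketch for the entry point declared above, assuming a GraphFloat32 has already been populated by the model builder. The second ModelTransformer constructor argument is passed as nullptr exactly as in the tests in this commit; the wrapper function name OptimizeGraph is illustrative, not part of the commit:

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h"

namespace tflite {
namespace gpu {

// Runs the standard transformation pipeline on an already-built graph.
// Returns false if any transformation left the graph in a broken state.
bool OptimizeGraph(GraphFloat32* graph) {
  ModelTransformer transformer(graph, nullptr);
  return ApplyGeneralTransformations(&transformer);
}

}  // namespace gpu
}  // namespace tflite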
@@ -0,0 +1,77 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.h"

#include "absl/memory/memory.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace {

bool IsConvEquivalentToFullyConnected(const Convolution2DAttributes& attr) {
  return attr.weights.shape.w == 1 &&           //
         attr.weights.shape.h == 1 &&           //
         attr.strides == HW(1, 1) &&            //
         attr.dilations == HW(1, 1) &&          //
         attr.padding.prepended == HW(0, 0) &&  //
         attr.padding.appended == HW(0, 0);
}

class MakeFullyConnectedFromConvolution : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final {
    if (node->operation.type != ToString(OperationType::CONVOLUTION_2D)) {
      return {TransformStatus::SKIPPED, ""};
    }
    auto inputs = graph->FindInputs(node->id);
    if (inputs.size() != 1) {
      return {TransformStatus::SKIPPED, ""};
    }

    const auto& input_shape = inputs[0]->tensor.shape;
    if (input_shape.w != 1 || input_shape.h != 1) {
      return {TransformStatus::SKIPPED, ""};
    }

    const auto& conv_attr = absl::any_cast<const Convolution2DAttributes&>(
        node->operation.attributes);
    if (!IsConvEquivalentToFullyConnected(conv_attr)) {
      return {TransformStatus::SKIPPED, ""};
    }

    FullyConnectedAttributes fc_attr;
    fc_attr.weights = conv_attr.weights;
    fc_attr.bias = conv_attr.bias;

    node->operation.attributes = fc_attr;
    node->operation.type = ToString(OperationType::FULLY_CONNECTED);
    return {TransformStatus::APPLIED,
            "Replaced convolution with fully connected."};
  }
};

}  // namespace

std::unique_ptr<NodeTransformation> NewMakeFullyConnectedFromConvolution() {
  return absl::make_unique<MakeFullyConnectedFromConvolution>();
}

}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,33 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_FULLY_CONNECTED_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

// Turns a convolution with a 1x1 kernel and an input tensor with h=1 and w=1
// into a fully connected operation.
std::unique_ptr<NodeTransformation> NewMakeFullyConnectedFromConvolution();

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_FULLY_CONNECTED_H_
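The transformation declared above is valid because a 1x1 convolution applied to a 1x1 (h=w=1) input degenerates to a matrix-vector product: each output channel is a dot product over the input channels plus a bias. A standalone sketch of that equivalence in plain C++ (not part of this commit; the Conv helper and its explicit-shape signature are illustrative):

#include <cassert>
#include <cmath>
#include <vector>

// Minimal OHWI-weight convolution over a single-batch HWC input, valid
// padding, stride 1. Shapes are passed explicitly to keep the sketch short.
std::vector<float> Conv(const std::vector<float>& w, int o, int kh, int kw,
                        int ci, const std::vector<float>& x, int h, int wdt,
                        const std::vector<float>& bias) {
  const int oh = h - kh + 1, ow = wdt - kw + 1;
  std::vector<float> y(oh * ow * o, 0.0f);
  for (int d = 0; d < o; ++d)
    for (int yy = 0; yy < oh; ++yy)
      for (int xx = 0; xx < ow; ++xx) {
        float acc = bias[d];
        for (int ky = 0; ky < kh; ++ky)
          for (int kx = 0; kx < kw; ++kx)
            for (int s = 0; s < ci; ++s)
              acc += w[((d * kh + ky) * kw + kx) * ci + s] *
                     x[((yy + ky) * wdt + (xx + kx)) * ci + s];
        y[(yy * ow + xx) * o + d] = acc;
      }
  return y;
}

int main() {
  // 1x1 kernel, 1x1 spatial input: the convolution is a matrix-vector product.
  const int o = 2, ci = 3;
  const std::vector<float> w = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};  // OHWI
  const std::vector<float> bias = {1.0f, -1.0f};
  const std::vector<float> x = {0.5f, 2.0f, -1.5f};  // BHWC(1, 1, 1, 3)

  const std::vector<float> conv = Conv(w, o, 1, 1, ci, x, 1, 1, bias);

  for (int d = 0; d < o; ++d) {
    // Fully connected computation for output channel d.
    float fc = bias[d];
    for (int s = 0; s < ci; ++s) fc += w[d * ci + s] * x[s];
    assert(std::fabs(conv[d] - fc) < 1e-6f);
  }
  return 0;
}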
@@ -0,0 +1,108 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"

namespace tflite {
namespace gpu {
namespace {

TEST(MakeFullyConnected, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 4, 4, 8);

  Convolution2DAttributes attr0;
  attr0.padding.prepended = HW(0, 0);
  attr0.padding.appended = HW(0, 0);
  attr0.strides = HW(1, 1);
  attr0.dilations = HW(1, 1);
  attr0.weights.shape = OHWI(16, 1, 1, 8);
  attr0.bias.shape = Linear(16);

  Convolution2DAttributes attr1;
  attr1.padding.prepended = HW(0, 0);
  attr1.padding.appended = HW(0, 0);
  attr1.strides = HW(4, 4);
  attr1.dilations = HW(1, 1);
  attr1.weights.shape = OHWI(16, 4, 4, 16);
  attr1.bias.shape = Linear(16);

  Convolution2DAttributes attr2;
  attr2.padding.prepended = HW(0, 0);
  attr2.padding.appended = HW(0, 0);
  attr2.strides = HW(1, 1);
  attr2.dilations = HW(1, 1);
  attr2.weights.shape = OHWI(32, 1, 1, 16);
  attr2.bias.shape = Linear(32);

  auto conv1x1_node0 = graph.NewNode();
  conv1x1_node0->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv1x1_node0->operation.attributes = attr0;
  auto conv4x4_node1 = graph.NewNode();
  conv4x4_node1->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv4x4_node1->operation.attributes = attr1;
  auto conv1x1_node2 = graph.NewNode();
  conv1x1_node2->operation.type = ToString(OperationType::CONVOLUTION_2D);
  conv1x1_node2->operation.attributes = attr2;

  ASSERT_TRUE(graph.AddConsumer(conv1x1_node0->id, input->id).ok());

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, conv1x1_node2, &output).ok());
  output->tensor.shape = BHWC(1, 1, 1, 32);

  Value<TensorRefFloat32>* link1;
  ASSERT_TRUE(
      ConnectTwoNodes(&graph, conv1x1_node0, conv4x4_node1, &link1).ok());
  link1->tensor.shape = BHWC(1, 4, 4, 16);

  Value<TensorRefFloat32>* link2;
  ASSERT_TRUE(
      ConnectTwoNodes(&graph, conv4x4_node1, conv1x1_node2, &link2).ok());
  link2->tensor.shape = BHWC(1, 1, 1, 16);

  ASSERT_EQ(3, graph.nodes().size());
  ASSERT_EQ(4, graph.values().size());

  auto transformation = NewMakeFullyConnectedFromConvolution();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("make_fully_connected", transformation.get());

  ASSERT_EQ(3, graph.nodes().size());
  ASSERT_EQ(4, graph.values().size());
  ASSERT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[0]->operation.type);
  ASSERT_EQ(ToString(OperationType::CONVOLUTION_2D),
            graph.nodes()[1]->operation.type);
  ASSERT_EQ(ToString(OperationType::FULLY_CONNECTED),
            graph.nodes()[2]->operation.type);
  auto fc_attr = absl::any_cast<FullyConnectedAttributes>(
      graph.nodes()[2]->operation.attributes);
  EXPECT_EQ(OHWI(32, 1, 1, 16), fc_attr.weights.shape);
  EXPECT_EQ(Linear(32), fc_attr.bias.shape);
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,101 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/make_padding.h"

#include "absl/memory/memory.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace {

bool IsConstZeros(const Node& node) {
  if (node.operation.type != ToString(OperationType::CONST)) {
    return false;
  }
  auto& attr =
      absl::any_cast<const ConstTensorAttributes&>(node.operation.attributes);
  for (auto f : attr.tensor.data) {
    if (f != 0) {
      return false;
    }
  }
  return true;
}

class MakePaddingFromZerosConcat : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final {
    if (node->operation.type != ToString(OperationType::CONCAT)) {
      return {TransformStatus::SKIPPED, ""};
    }
    auto inputs = graph->FindInputs(node->id);
    if (inputs.size() != 2) {
      return {TransformStatus::SKIPPED, ""};
    }

    bool first = true;
    for (auto input : inputs) {
      auto dep = graph->FindProducer(input->id);
      if (dep != nullptr && IsConstZeros(*dep)) {
        auto& concat_attr =
            absl::any_cast<const ConcatAttributes&>(node->operation.attributes);
        PadAttributes pad_attr;
        pad_attr.type = PaddingContentType::ZEROS;
        pad_attr.appended = HWC(0, 0, 0);
        pad_attr.prepended = HWC(0, 0, 0);
        HWC* p = first ? &pad_attr.prepended : &pad_attr.appended;
        switch (concat_attr.axis) {
          case Axis::HEIGHT:
            p->h = input->tensor.shape.h;
            break;
          case Axis::WIDTH:
            p->w = input->tensor.shape.w;
            break;
          case Axis::CHANNELS:
            p->c = input->tensor.shape.c;
            break;
          default:
            return {TransformStatus::DECLINED,
                    "Padding for concat axis is unsupported: " +
                        ToString(concat_attr.axis)};
        }
        Status status = RemovePrecedingNode(graph, dep, node);
        if (!status.ok()) {
          return {TransformStatus::INVALID,
                  "Unable to remove const node: " + status.error_message()};
        }
        node->operation.attributes = pad_attr;
        node->operation.type = ToString(OperationType::PAD);
        return {TransformStatus::APPLIED, "Replaced concat with padding"};
      }
      first = false;
    }
    return {TransformStatus::SKIPPED, ""};
  }
};

}  // namespace

std::unique_ptr<NodeTransformation> NewMakePaddingFromConcat() {
  return absl::make_unique<MakePaddingFromZerosConcat>();
}

}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,33 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_PADDING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_PADDING_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

// Turns a concat of exactly two tensors, where one of them is all zeros, into
// a padding operation.
std::unique_ptr<NodeTransformation> NewMakePaddingFromConcat();

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MAKE_PADDING_H_
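The declaration above rests on a simple observation: concatenating a tensor with an all-zero tensor along some axis produces the same values as padding the tensor with zeros along that axis (prepended if the zero tensor comes first, appended if it comes second). A tiny one-dimensional sketch of the appended case in plain C++ (not part of this commit):

#include <cassert>
#include <vector>

int main() {
  const std::vector<float> x = {1.0f, 2.0f, 3.0f};
  const std::vector<float> zeros(2, 0.0f);

  // concat([x, zeros]) along the only axis...
  std::vector<float> concatenated = x;
  concatenated.insert(concatenated.end(), zeros.begin(), zeros.end());

  // ...equals pad(x) with 2 zeros appended along that axis.
  std::vector<float> padded = x;
  padded.resize(x.size() + 2, 0.0f);

  assert(concatenated == padded);
  return 0;
}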
@@ -0,0 +1,75 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/make_padding.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"

namespace tflite {
namespace gpu {
namespace {

TEST(MakePadding, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  input->tensor.shape = BHWC(1, 2, 3, 5);

  auto concat_node = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(concat_node->id, input->id).ok());
  concat_node->operation.type = ToString(OperationType::CONCAT);
  ConcatAttributes attr;
  attr.axis = Axis::HEIGHT;
  concat_node->operation.attributes = attr;

  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, concat_node, &output).ok());
  output->tensor.shape = BHWC(1, 7, 3, 5);

  auto const_node = graph.NewNode();
  const_node->operation.type = ToString(OperationType::CONST);
  ConstTensorAttributes const_attr;
  const_attr.tensor.shape = BHWC(1, 5, 3, 5);
  const_attr.tensor.data =
      std::vector<float>(const_attr.tensor.shape.DimensionsProduct(), 0);
  const_node->operation.attributes = const_attr;

  Value<TensorRefFloat32>* const_link;
  ASSERT_TRUE(
      ConnectTwoNodes(&graph, const_node, concat_node, &const_link).ok());
  const_link->tensor.shape = const_attr.tensor.shape;

  ASSERT_EQ(2, graph.nodes().size());

  auto transformation = NewMakePaddingFromConcat();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("make_padding", transformation.get());

  ASSERT_EQ(1, graph.nodes().size());
  ASSERT_EQ(2, graph.values().size());
  auto pad_node = graph.nodes()[0];
  ASSERT_EQ(ToString(OperationType::PAD), pad_node->operation.type);
  auto pad_attr = absl::any_cast<PadAttributes>(pad_node->operation.attributes);
  EXPECT_EQ(HWC(0, 0, 0), pad_attr.prepended);
  EXPECT_EQ(HWC(5, 0, 0), pad_attr.appended);
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,44 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCHING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCHING_H_

// This file provides predicates for matching subgraphs.

#include <algorithm>
#include <string>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/model.h"

namespace tflite {
namespace gpu {

// Returns true if `nodes` and `types` have the same length and every node's
// operation type equals the corresponding entry in `types`.
template <typename T>
bool MatchesByOperationType(const T& nodes,
                            const std::vector<std::string>& types) {
  if (nodes.size() != types.size()) return false;
  return std::mismatch(nodes.begin(), nodes.end(), types.begin(),
                       [&](typename T::value_type a, const std::string& b) {
                         return a->operation.type == b;
                       })
             .first == nodes.end();
}

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MATCHING_H_
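A standalone sketch of how the predicate above behaves, using hypothetical stand-in structs instead of the real tflite::gpu::Node (the template only needs `node->operation.type`; the string values are illustrative, not the actual ToString outputs):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Stand-ins for the graph types, just to exercise the template.
struct Operation {
  std::string type;
};
struct Node {
  Operation operation;
};

template <typename T>
bool MatchesByOperationType(const T& nodes,
                            const std::vector<std::string>& types) {
  if (nodes.size() != types.size()) return false;
  return std::mismatch(nodes.begin(), nodes.end(), types.begin(),
                       [&](typename T::value_type a, const std::string& b) {
                         return a->operation.type == b;
                       })
             .first == nodes.end();
}

int main() {
  Node pad{{"pad"}}, conv{{"convolution_2d"}};
  std::vector<Node*> sequence = {&pad, &conv};

  assert(MatchesByOperationType(sequence, {"pad", "convolution_2d"}));
  assert(!MatchesByOperationType(sequence, {"pad", "pooling_2d"}));
  assert(!MatchesByOperationType(sequence, {"pad"}));  // length mismatch
  return 0;
}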
@@ -0,0 +1,171 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"

#include <string>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/matching.h"

namespace tflite {
namespace gpu {
namespace {

template <typename Attr>
class MergePaddingWith2DOperation : public SequenceTransformation {
 public:
  explicit MergePaddingWith2DOperation(OperationType operation_type)
      : operations_to_match_(
            {ToString(OperationType::PAD), ToString(operation_type)}) {}

  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    if (!MatchesByOperationType(sequence, operations_to_match_)) {
      return {TransformStatus::SKIPPED, ""};
    }

    Node* pad_node = sequence.front();
    Node* op_node = sequence.back();

    PadAttributes pad_attr =
        absl::any_cast<PadAttributes>(pad_node->operation.attributes);

    if (pad_attr.type != PaddingContentType::ZEROS) {
      return {TransformStatus::DECLINED, "Only Zero padding is supported."};
    }
    if (pad_attr.appended.c != 0 || pad_attr.prepended.c != 0) {
      return {TransformStatus::DECLINED,
              "Pad has non-zero padding on non HW axis."};
    }

    Attr* node_attr = absl::any_cast<Attr>(&op_node->operation.attributes);
    Status status = RemovePrecedingNode(graph, pad_node, op_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
              "Unable to remove Pad node with Operation node: " +
                  status.error_message()};
    }

    node_attr->padding.appended.h += pad_attr.appended.h;
    node_attr->padding.appended.w += pad_attr.appended.w;
    node_attr->padding.prepended.h += pad_attr.prepended.h;
    node_attr->padding.prepended.w += pad_attr.prepended.w;
    return {
        TransformStatus::APPLIED,
        absl::StrCat("Added padding: prepended = {h = ", pad_attr.prepended.h,
                     ", w = ", pad_attr.prepended.w, "}, appended = { h = ",
                     pad_attr.appended.h, ", w = ", pad_attr.appended.w, "}")};
  }

 private:
  const std::vector<std::string> operations_to_match_;
};

}  // namespace

std::unique_ptr<SequenceTransformation> NewMergePaddingWithPooling() {
  return absl::make_unique<MergePaddingWith2DOperation<Pooling2DAttributes>>(
      OperationType::POOLING_2D);
}

std::unique_ptr<SequenceTransformation> NewMergePaddingWithConvolution2D() {
  return absl::make_unique<
      MergePaddingWith2DOperation<Convolution2DAttributes>>(
      OperationType::CONVOLUTION_2D);
}

std::unique_ptr<SequenceTransformation>
NewMergePaddingWithDepthwiseConvolution() {
  return absl::make_unique<
      MergePaddingWith2DOperation<DepthwiseConvolution2DAttributes>>(
      OperationType::DEPTHWISE_CONVOLUTION);
}

class MergePaddingWithAddOperation : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final {
    if (node->operation.type != ToString(OperationType::PAD)) {
      return {TransformStatus::SKIPPED, ""};
    }
    auto inputs = graph->FindInputs(node->id);
    if (inputs.size() != 1) {
      return {TransformStatus::SKIPPED, ""};
    }

    const auto& input_shape = graph->FindInputs(node->id)[0]->tensor.shape;
    if (input_shape.c % 4 != 0) {
      return {TransformStatus::DECLINED,
              "Pad with input where src_channels % 4 != 0"};
    }

    PadAttributes pad_attr =
        absl::any_cast<PadAttributes>(node->operation.attributes);

    if (pad_attr.type != PaddingContentType::ZEROS) {
      return {TransformStatus::DECLINED, "Only Zero padding is supported."};
    }
    if (pad_attr.prepended != HWC(0, 0, 0) || pad_attr.appended.h != 0 ||
        pad_attr.appended.w != 0) {
      return {TransformStatus::DECLINED,
              "Pad has padding not only in appended channels axis."};
    }

    auto pad_output = graph->FindOutputs(node->id)[0];
    auto consumer_nodes = graph->FindConsumers(pad_output->id);
    if (consumer_nodes.size() != 1) {
      return {TransformStatus::SKIPPED, ""};
    }
    auto add_node = consumer_nodes[0];
    auto consumer_type = OperationTypeFromString(add_node->operation.type);
    if (consumer_type != OperationType::ADD) {
      return {TransformStatus::SKIPPED, ""};
    }

    AddAttributes add_attr =
        absl::any_cast<AddAttributes>(add_node->operation.attributes);
    auto add_broadcasted_vector =
        absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&add_attr.param);
    if (add_broadcasted_vector) {
      return {TransformStatus::SKIPPED,
              "Cannot remove padding when the ADD is broadcasted."};
    }

    Status status = RemovePrecedingNode(graph, node, add_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
              "Unable to remove Pad node " + status.error_message()};
    }

    return {TransformStatus::APPLIED,
            "Removed padding with zeroes in appended channels dimension"};
  }
};

std::unique_ptr<NodeTransformation> NewMergePaddingWithAdd() {
  return absl::make_unique<MergePaddingWithAddOperation>();
}

}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,53 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MERGE_PADDING_WITH_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MERGE_PADDING_WITH_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

std::unique_ptr<SequenceTransformation> NewMergePaddingWithPooling();

std::unique_ptr<SequenceTransformation> NewMergePaddingWithConvolution2D();

std::unique_ptr<SequenceTransformation>
NewMergePaddingWithDepthwiseConvolution();

// This transform requires the Add operation to support inputs of unequal
// shapes. The padding must be with zeroes, appended only in the channels (Z)
// axis, and the input tensor's channel count must be divisible by 4 (aligned).
// It replaces the following pattern:
//   1) a tensor is padded with zeroes in the Z dimension, for example from 24
//      to 32 channels;
//   2) that tensor is used only by an Add operation, which then adds these
//      useless zeroes in channels 24-32.
// The useless addition is removed by relying on Add with unequal inputs:
// instead of zero-filling and adding that part, the Add operation performs an
// additional check for this tensor:
//   if (channels < src_channels) {
//     result += tensor_from_pad_operation.data[index];
//   }
std::unique_ptr<NodeTransformation> NewMergePaddingWithAdd();

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_MERGE_PADDING_WITH_H_
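The comment above boils down to an elementwise identity: adding a tensor whose extra, appended channels are all zero contributes nothing in those channels, so the pad can be dropped whenever the Add can consume inputs with unequal channel counts. A small standalone sketch of that equivalence in plain C++ (not part of this commit; the channel counts are illustrative and the %4 alignment requirement is ignored here):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  // `x` has 2 channels, the other Add input `y` has 4.
  const std::vector<float> x = {1.0f, 2.0f};
  const std::vector<float> y = {10.0f, 20.0f, 30.0f, 40.0f};

  // Reference: zero-pad x to 4 channels, then add elementwise.
  std::vector<float> padded = x;
  padded.resize(y.size(), 0.0f);
  std::vector<float> reference(y.size());
  for (std::size_t c = 0; c < y.size(); ++c) reference[c] = y[c] + padded[c];

  // Without the pad: add x only where it has data, skip the missing channels.
  std::vector<float> merged = y;
  for (std::size_t c = 0; c < y.size(); ++c) {
    if (c < x.size()) merged[c] += x[c];
  }

  for (std::size_t c = 0; c < y.size(); ++c)
    assert(std::fabs(reference[c] - merged[c]) < 1e-6f);
  return 0;
}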
@@ -0,0 +1,151 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"

namespace tflite {
namespace gpu {
namespace {

TEST(MergePaddingWith, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();

  auto pad_node = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(pad_node->id, input->id).ok());
  pad_node->operation.type = ToString(OperationType::PAD);
  PadAttributes attr;
  attr.prepended = HWC(1, 1, 0);
  attr.appended = HWC(2, 2, 0);
  pad_node->operation.attributes = attr;

  auto conv_node = graph.NewNode();
  Value<TensorRefFloat32>* temp;
  ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node, conv_node, &temp).ok());
  ASSERT_TRUE(AddOutput(&graph, conv_node, &temp).ok());
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  Convolution2DAttributes conv_attr;
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.padding.prepended = HW(0, 0);
  conv_node->operation.attributes = conv_attr;

  ASSERT_EQ(2, graph.nodes().size());

  auto transformation = NewMergePaddingWithConvolution2D();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_padding", transformation.get());

  ASSERT_EQ(1, graph.nodes().size());
  ASSERT_EQ(2, graph.values().size());
  ASSERT_EQ(conv_node, graph.nodes()[0]);
  conv_attr =
      absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
  EXPECT_EQ(HW(1, 1), conv_attr.padding.prepended);
  EXPECT_EQ(HW(2, 2), conv_attr.padding.appended);
}

TEST(MergePaddingWith, MergeTwo) {
  GraphFloat32 graph;
  auto input = graph.NewValue();

  auto pad_node1 = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(pad_node1->id, input->id).ok());
  pad_node1->operation.type = ToString(OperationType::PAD);
  PadAttributes attr;
  attr.prepended = HWC(1, 1, 0);
  attr.appended = HWC(0, 0, 0);
  pad_node1->operation.attributes = attr;

  auto pad_node2 = graph.NewNode();
  Value<TensorRefFloat32>* temp;
  ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node1, pad_node2, &temp).ok());
  pad_node2->operation.type = ToString(OperationType::PAD);
  attr.prepended = HWC(0, 0, 0);
  attr.appended = HWC(2, 2, 0);
  pad_node2->operation.attributes = attr;

  auto conv_node = graph.NewNode();
  ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node2, conv_node, &temp).ok());
  ASSERT_TRUE(AddOutput(&graph, conv_node, &temp).ok());
  conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D);
  Convolution2DAttributes conv_attr;
  conv_attr.padding.appended = HW(0, 0);
  conv_attr.padding.prepended = HW(0, 0);
  conv_node->operation.attributes = conv_attr;

  ASSERT_EQ(3, graph.nodes().size());

  auto transformation = NewMergePaddingWithConvolution2D();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_padding", transformation.get());

  ASSERT_EQ(1, graph.nodes().size());
  ASSERT_EQ(2, graph.values().size());
  ASSERT_EQ(conv_node, graph.nodes()[0]);
  conv_attr =
      absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
  EXPECT_EQ(HW(1, 1), conv_attr.padding.prepended);
  EXPECT_EQ(HW(2, 2), conv_attr.padding.appended);
}

TEST(MergePaddingWithAdd, MergeOne) {
  GraphFloat32 graph;
  auto input0 = graph.NewValue();
  input0->tensor.shape = BHWC(1, 4, 4, 8);
  auto input1 = graph.NewValue();
  auto padded = graph.NewValue();
  auto output = graph.NewValue();

  auto pad_node = graph.NewNode();
  pad_node->operation.type = ToString(OperationType::PAD);
  PadAttributes pad_attr;
  pad_attr.prepended = HWC(0, 0, 0);
  pad_attr.appended = HWC(0, 0, 32);
  pad_node->operation.attributes = pad_attr;

  ASSERT_TRUE(graph.AddConsumer(pad_node->id, input0->id).ok());
  ASSERT_TRUE(graph.SetProducer(pad_node->id, padded->id).ok());

  auto add_node = graph.NewNode();
  AddAttributes add_attr;
  ASSERT_TRUE(graph.AddConsumer(add_node->id, padded->id).ok());
  ASSERT_TRUE(graph.AddConsumer(add_node->id, input1->id).ok());
  ASSERT_TRUE(graph.SetProducer(add_node->id, output->id).ok());
  add_node->operation.type = ToString(OperationType::ADD);
  add_node->operation.attributes = add_attr;

  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(4, graph.values().size());

  auto transformation = NewMergePaddingWithAdd();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("merge_padding", transformation.get());

  ASSERT_EQ(1, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());
  EXPECT_EQ(add_node, graph.nodes()[0]);
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,100 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/remove_noop.h"

#include <string>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace {

using ShouldRemoveOperation = std::function<bool(GraphFloat32* graph, Node*)>;

class RemoveOperation : public SequenceTransformation {
 public:
  explicit RemoveOperation(ShouldRemoveOperation remove_predicate)
      : remove_predicate_(std::move(remove_predicate)) {}

  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final {
    Node* prev_op_node = sequence.front();
    Node* op_node = sequence.back();
    if (!remove_predicate_(graph, op_node)) {
      return {TransformStatus::SKIPPED, ""};
    }
    Status status = RemoveFollowingNode(graph, op_node, prev_op_node);
    if (!status.ok()) {
      return {TransformStatus::INVALID,
              "Unable to remove a node: " + status.error_message()};
    }
    return {TransformStatus::APPLIED, ""};
  }

 private:
  ShouldRemoveOperation remove_predicate_;
};

}  // namespace

std::unique_ptr<SequenceTransformation> NewRemoveSingleInputConcat() {
  // Using SequenceTransformation implies that CONCAT has a single input.
  auto type = ToString(OperationType::CONCAT);
  return absl::make_unique<RemoveOperation>(
      [type](GraphFloat32* graph, Node* node) {
        return type == node->operation.type;
      });
}

std::unique_ptr<SequenceTransformation> NewRemoveSingleInputAdd() {
  // Using SequenceTransformation implies that ADD has a single input.
  auto type = ToString(OperationType::ADD);
  return absl::make_unique<RemoveOperation>(
      [type](GraphFloat32* graph, Node* node) {
        if (node->operation.type != type) {
          return false;
        }
        auto& attr =
            absl::any_cast<const AddAttributes&>(node->operation.attributes);
        return absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&attr.param) ==
               nullptr;
      });
}

std::unique_ptr<SequenceTransformation> NewRemoveDegenerateUpsampling() {
  auto type = ToString(OperationType::UPSAMPLE_2D);
  return absl::make_unique<RemoveOperation>(
      [type](GraphFloat32* graph, Node* node) {
        if (node->operation.type != type) {
          return false;
        }
        auto inputs = graph->FindInputs(node->id);
        auto outputs = graph->FindOutputs(node->id);
        return inputs.size() == 1 && outputs.size() == 1 &&
               inputs[0]->tensor.shape == outputs[0]->tensor.shape;
      });
}

}  // namespace gpu
}  // namespace tflite
@@ -0,0 +1,35 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_REMOVE_NOOP_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_REMOVE_NOOP_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {

std::unique_ptr<SequenceTransformation> NewRemoveSingleInputConcat();

std::unique_ptr<SequenceTransformation> NewRemoveSingleInputAdd();

std::unique_ptr<SequenceTransformation> NewRemoveDegenerateUpsampling();

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_REMOVE_NOOP_H_
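As with the padding transforms, a minimal sketch of how these removal passes are run over a graph (assumed wiring; the transformer usage and the "noop" id follow the tests just below):

    GraphFloat32 graph;
    // ... populate graph ...
    ModelTransformer transformer(&graph, nullptr);
    auto remove_add = NewRemoveSingleInputAdd();
    transformer.Apply("noop", remove_add.get());
    auto remove_upsampling = NewRemoveDegenerateUpsampling();
    transformer.Apply("noop", remove_upsampling.get());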
@@ -0,0 +1,146 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/transformations/remove_noop.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"

namespace tflite {
namespace gpu {
namespace {

TEST(RemoveSingleInputAdd, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  auto first_node = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok());

  auto add_node = graph.NewNode();
  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok());
  add_node->operation.type = ToString(OperationType::ADD);
  add_node->operation.attributes = AddAttributes();

  Value<TensorRefFloat32>* temp;
  ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok());
  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewRemoveSingleInputAdd();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("noop", transformation.get());

  EXPECT_EQ(1, graph.nodes().size());
  ASSERT_EQ(2, graph.values().size());
  ASSERT_EQ(first_node, graph.nodes()[0]);
  ASSERT_EQ(input, graph.values()[0]);
  ASSERT_EQ(output, graph.values()[1]);
}

TEST(RemoveSingleInputAdd, DoNotTrigger_Tensor) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  auto first_node = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok());

  auto add_node = graph.NewNode();
  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok());
  add_node->operation.type = ToString(OperationType::ADD);
  AddAttributes attr;
  attr.param = Tensor<Linear, DataType::FLOAT32>();
  add_node->operation.attributes = attr;

  Value<TensorRefFloat32>* temp;
  ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok());
  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewRemoveSingleInputAdd();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("noop", transformation.get());

  EXPECT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());
}

TEST(RemoveSingleInputAdd, DoNotTrigger_Multiple) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  auto node_a = graph.NewNode();
  auto node_b = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(node_a->id, input->id).ok());
  ASSERT_TRUE(graph.AddConsumer(node_b->id, input->id).ok());

  auto add_node = graph.NewNode();
  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok());
  add_node->operation.type = ToString(OperationType::ADD);

  Value<TensorRefFloat32>* temp;
  ASSERT_TRUE(ConnectTwoNodes(&graph, node_a, add_node, &temp).ok());
  ASSERT_TRUE(ConnectTwoNodes(&graph, node_b, add_node, &temp).ok());
  ASSERT_EQ(3, graph.nodes().size());
  ASSERT_EQ(4, graph.values().size());

  auto transformation = NewRemoveSingleInputAdd();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("noop", transformation.get());

  ASSERT_EQ(3, graph.nodes().size());
  ASSERT_EQ(4, graph.values().size());
}

TEST(RemoveDegenerateUpsampling, Smoke) {
  GraphFloat32 graph;
  auto input = graph.NewValue();
  auto first_node = graph.NewNode();
  ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok());

  auto node_to_remove = graph.NewNode();
  Value<TensorRefFloat32>* output;
  ASSERT_TRUE(AddOutput(&graph, node_to_remove, &output).ok());
  output->tensor.shape = BHWC(1, 5, 5, 1);
  node_to_remove->operation.type = ToString(OperationType::UPSAMPLE_2D);
  Upsample2DAttributes attr;
  attr.new_shape = HW(5, 5);
  attr.type = UpsamplingType::BILINEAR;
  node_to_remove->operation.attributes = attr;

  Value<TensorRefFloat32>* link;
  ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, node_to_remove, &link).ok());
  link->tensor.shape = output->tensor.shape;
  ASSERT_EQ(2, graph.nodes().size());
  ASSERT_EQ(3, graph.values().size());

  auto transformation = NewRemoveDegenerateUpsampling();
  ModelTransformer transformer(&graph, nullptr);
  transformer.Apply("noop", transformation.get());

  ASSERT_EQ(1, graph.nodes().size());
  ASSERT_EQ(2, graph.values().size());
  EXPECT_EQ(first_node, graph.nodes()[0]);
  EXPECT_EQ(input, graph.values()[0]);
  EXPECT_EQ(output, graph.values()[1]);
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
208 tensorflow/lite/delegates/gpu/common/types.h Normal file
@@ -0,0 +1,208 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TYPES_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TYPES_H_

#include <array>
#include <cstddef>
#include <cstdint>
#include <string>

#include <fp16.h>

namespace tflite {
namespace gpu {

// TODO(akulik): make these types Google-style compliant.

using HalfBits = uint16_t;

class alignas(2) half {
 public:
  HalfBits bits;

  half() = default;

  half(const half& f) : bits(f.bits) {}

  explicit half(float other) { bits = fp16_ieee_from_fp32_value(other); }

  void operator=(float f) { *this = half(f); }

  operator float() const { return fp16_ieee_to_fp32_value(bits); }
};

template <typename T>
struct alignas(sizeof(T)) Vec4 {
  union {
    struct {
      T x, y, z, w;
    };
    std::array<T, 4> data_;
  };

  Vec4() : Vec4(T(0.0f)) {}

  template <typename S>
  Vec4(S x_, S y_, S z_, S w_) : x(x_), y(y_), z(z_), w(w_) {}
  explicit Vec4(T v) : x(v), y(v), z(v), w(v) {}

  template <typename S>
  explicit Vec4(S v) : x(v), y(v), z(v), w(v) {}

  Vec4(const Vec4& f) : x(f.x), y(f.y), z(f.z), w(f.w) {}

  template <typename S>
  Vec4(const Vec4<S>& f) : x(f.x), y(f.y), z(f.z), w(f.w) {}

  Vec4& operator=(const Vec4& other) {
    x = other.x;
    y = other.y;
    z = other.z;
    w = other.w;
    return *this;
  }

  static constexpr int size() { return 4; }

  T& operator[](size_t n) { return data_[n]; }
  T operator[](size_t n) const { return data_[n]; }

  bool operator==(const Vec4& value) const {
    return data_[0] == value[0] && data_[1] == value[1] &&
           data_[2] == value[2] && data_[3] == value[3];
  }
  bool operator!=(const Vec4& value) const {
    return !(this->operator==(value));
  }
};

template <typename T>
struct alignas(sizeof(T)) Vec3 {
  union {
    struct {
      T x, y, z;
    };
    std::array<T, 3> data_;
  };

  Vec3() : Vec3(T(0.0f)) {}

  template <typename S>
  constexpr Vec3(S x_, S y_, S z_) : x(x_), y(y_), z(z_) {}
  explicit Vec3(T v) : x(v), y(v), z(v) {}

  template <typename S>
  explicit Vec3(S v) : x(v), y(v), z(v) {}

  Vec3(const Vec3& f) : x(f.x), y(f.y), z(f.z) {}

  template <typename S>
  Vec3(const Vec3<S>& f) : x(f.x), y(f.y), z(f.z) {}

  Vec3& operator=(const Vec3& other) {
    x = other.x;
    y = other.y;
    z = other.z;
    return *this;
  }

  static constexpr int size() { return 3; }

  T& operator[](size_t n) { return data_[n]; }
  T operator[](size_t n) const { return data_[n]; }
  bool operator==(const Vec3& value) const {
    return data_[0] == value[0] && data_[1] == value[1] && data_[2] == value[2];
  }
  bool operator!=(const Vec3& value) const {
    return !(this->operator==(value));
  }
};

template <typename T>
struct alignas(sizeof(T)) Vec2 {
  union {
    struct {
      T x, y;
    };
    std::array<T, 2> data_;
  };

  Vec2() : Vec2(T(0.0f)) {}

  template <typename S>
  Vec2(S x_, S y_) : x(x_), y(y_) {}
  explicit Vec2(T v) : x(v), y(v) {}

  template <typename S>
  explicit Vec2(S v) : x(v), y(v) {}

  Vec2(const Vec2& f) : x(f.x), y(f.y) {}

  template <typename S>
  Vec2(const Vec2<S>& f) : x(f.x), y(f.y) {}

  Vec2& operator=(const Vec2& other) {
    x = other.x;
    y = other.y;
    return *this;
  }

  bool operator==(const Vec2& value) const {
    return data_[0] == value[0] && data_[1] == value[1];
  }

  bool operator!=(const Vec2& value) const {
    return !(this->operator==(value));
  }

  static constexpr int size() { return 2; }

  T& operator[](size_t n) { return data_[n]; }
  T operator[](size_t n) const { return data_[n]; }
};

using float2 = Vec2<float>;
using half2 = Vec2<half>;
using byte2 = Vec2<int8_t>;
using ubyte2 = Vec2<uint8_t>;
using short2 = Vec2<int16_t>;
using ushort2 = Vec2<uint16_t>;
using int2 = Vec2<int32_t>;
using uint2 = Vec2<uint32_t>;

using float3 = Vec3<float>;
using half3 = Vec3<half>;
using byte3 = Vec3<int8_t>;
using ubyte3 = Vec3<uint8_t>;
using short3 = Vec3<int16_t>;
using ushort3 = Vec3<uint16_t>;
using int3 = Vec3<int32_t>;
using uint3 = Vec3<uint32_t>;

using float4 = Vec4<float>;
using half4 = Vec4<half>;
using byte4 = Vec4<int8_t>;
using ubyte4 = Vec4<uint8_t>;
using short4 = Vec4<int16_t>;
using ushort4 = Vec4<uint16_t>;
using int4 = Vec4<int32_t>;
using uint4 = Vec4<uint32_t>;

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TYPES_H_
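A brief usage sketch for the types above (illustrative values, not taken from the commit): half round-trips a float through 16-bit storage, and the VecN aliases provide component and index access.

    tflite::gpu::half h(1.5f);      // stores fp16 bits via fp16_ieee_from_fp32_value
    float restored = h;             // implicit conversion back to float
    tflite::gpu::float4 v(0.0f, 1.0f, 2.0f, 3.0f);
    float y = v[1];                 // element access through operator[]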
51 tensorflow/lite/delegates/gpu/common/util.h Normal file
@@ -0,0 +1,51 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_UTIL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_UTIL_H_

#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {

// @param n must be non negative
// @param divisor must be greater than zero
template <typename T, typename N>
T IntegralDivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}

template <>
inline ::tflite::gpu::uint3 IntegralDivideRoundUp(
    ::tflite::gpu::uint3 n, ::tflite::gpu::uint3 divisor) {
  return ::tflite::gpu::uint3(IntegralDivideRoundUp(n.x, divisor.x),
                              IntegralDivideRoundUp(n.y, divisor.y),
                              IntegralDivideRoundUp(n.z, divisor.z));
}

// @param number or its components must be greater than zero
// @param n must be greater than zero
template <typename T, typename N>
T AlignByN(T number, N n) {
  return IntegralDivideRoundUp(number, n) * n;
}

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_UTIL_H_
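A worked example for the helpers above (illustrative values): the uint3 overload rounds each component up independently, which is how per-dimension workgroup counts are derived later in api.cc.

    using tflite::gpu::uint3;
    uint3 groups = tflite::gpu::IntegralDivideRoundUp(uint3(60, 45, 7), uint3(8, 8, 4));
    // groups == uint3(8, 6, 2): 60/8 -> 8, 45/8 -> 6, 7/4 -> 2, each rounded up.
    // AlignByN(60, 8) == 64: 60 rounded up to the next multiple of 8.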
53 tensorflow/lite/delegates/gpu/common/util_test.cc Normal file
@@ -0,0 +1,53 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/util.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

namespace tflite {
namespace gpu {
namespace {

using testing::Eq;

TEST(UtilTest, IntegralDivideRoundUp) {
  EXPECT_THAT(IntegralDivideRoundUp(0, 256), Eq(0));
  EXPECT_THAT(IntegralDivideRoundUp(2u, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(2, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(255u, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(255, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(256u, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(256, 256), Eq(1));
  EXPECT_THAT(IntegralDivideRoundUp(257u, 256), Eq(2));
  EXPECT_THAT(IntegralDivideRoundUp(257, 256), Eq(2));
}

TEST(UtilTest, AlignByN) {
  EXPECT_THAT(AlignByN(0u, 256), Eq(0));
  EXPECT_THAT(AlignByN(1u, 256), Eq(256));
  EXPECT_THAT(AlignByN(255u, 256), Eq(256));
  EXPECT_THAT(AlignByN(256u, 256), Eq(256));
  EXPECT_THAT(AlignByN(257u, 256), Eq(512));

  EXPECT_THAT(AlignByN(1, 4), Eq(4));
  EXPECT_THAT(AlignByN(80, 4), Eq(80));
  EXPECT_THAT(AlignByN(81, 4), Eq(84));
}

}  // namespace
}  // namespace gpu
}  // namespace tflite
433 tensorflow/lite/delegates/gpu/gl/BUILD Normal file
@@ -0,0 +1,433 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])  # Apache 2.0

load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")

cc_library(
    name = "api",
    srcs = ["api.cc"],
    hdrs = ["api.h"],
    deps = [
        ":command_queue",
        ":compiler",
        ":compiler_options",
        ":gl_call",
        ":gpu_info",
        ":node_shader",
        ":object",
        ":object_manager",
        ":portable",
        ":runtime",
        ":runtime_options",
        ":stats",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "//tensorflow/lite/delegates/gpu/gl/workgroups:calculator",
    ] + select({
        "//tensorflow/lite/delegates/gpu:tflite_gpu_binary_release": [],
        "//conditions:default": [
            ":serialization",
        ],
    }),
)

cc_library(
    name = "command_queue",
    srcs = ["command_queue.cc"],
    hdrs = ["command_queue.h"],
    deps = [
        ":gl_call",
        ":gl_program",
        ":gl_sync",
        ":gpu_info",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/memory",
    ],
)

flatbuffer_cc_library(
    name = "common_cc_fbs",
    srcs = ["common.fbs"],
)

# Generic schema for inference on GPU device.
flatbuffer_cc_library(
    name = "compiled_model_cc_fbs",
    srcs = ["compiled_model.fbs"],
    flatc_args = [
        "--scoped-enums",
    ],
    includes = [
        "//tensorflow/lite/delegates/gpu/gl:common_cc_fbs_includes",
    ],
)

cc_library(
    name = "compiler",
    srcs = ["compiler.cc"],
    hdrs = ["compiler.h"],
    deps = [
        ":compiler_options",
        ":float16_conversions",
        ":gpu_info",
        ":node_shader",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl/compiler:compiled_node",
        "//tensorflow/lite/delegates/gpu/gl/compiler:fuse_auto_input",
        "//tensorflow/lite/delegates/gpu/gl/compiler:fuse_inline",
        "//tensorflow/lite/delegates/gpu/gl/compiler:fuse_inplace",
        "//tensorflow/lite/delegates/gpu/gl/compiler:shader_code",
        "//tensorflow/lite/delegates/gpu/gl/compiler:shader_codegen",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:any",
    ],
)

cc_library(
    name = "compiler_options",
    hdrs = ["compiler_options.h"],
    deps = [
        ":gpu_info",
        ":object",
    ],
)

cc_library(
    name = "egl_context",
    srcs = ["egl_context.cc"],
    hdrs = ["egl_context.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
)

cc_library(
    name = "egl_environment",
    srcs = ["egl_environment.cc"],
    hdrs = ["egl_environment.h"],
    deps = [
        ":egl_context",
        ":egl_surface",
        ":gl_call",
        ":gpu_info",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
    ],
)

cc_library(
    name = "egl_surface",
    srcs = ["egl_surface.cc"],
    hdrs = ["egl_surface.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
)

cc_library(
    name = "float16_conversions",
    srcs = ["float16_conversions.cc"],
    hdrs = ["float16_conversions.h"],
    deps = [
        ":object",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "@FP16",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_library(
    name = "gl_buffer",
    srcs = ["gl_buffer.cc"],
    hdrs = ["gl_buffer.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/types:span",
    ],
)

cc_test(
    name = "gl_buffer_test",
    srcs = ["gl_buffer_test.cc"],
    linkopts = [
        "-lGLESv3",
        "-lEGL",
    ],
    tags = [
        "local",
        "nobuilder",
        "notap",
    ],
    deps = [
        ":egl_environment",
        ":gl_buffer",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "gl_call",
    hdrs = ["gl_call.h"],
    deps = [
        ":gl_errors",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
)

cc_library(
    name = "gl_errors",
    srcs = ["gl_errors.cc"],
    hdrs = ["gl_errors.h"],
    deps = [
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "gl_program",
    srcs = ["gl_program.cc"],
    hdrs = ["gl_program.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":gl_shader",
        ":portable",
        ":uniform_parameter",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_library(
    name = "gl_shader",
    srcs = ["gl_shader.cc"],
    hdrs = ["gl_shader.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
)

cc_library(
    name = "gl_texture",
    srcs = ["gl_texture.cc"],
    hdrs = ["gl_texture.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:span",
    ],
)

cc_library(
    name = "gl_sync",
    srcs = ["gl_sync.cc"],
    hdrs = ["gl_sync.h"],
    deps = [
        ":gl_call",
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
)

cc_library(
    name = "gpu_info",
    srcs = ["gpu_info.cc"],
    hdrs = ["gpu_info.h"],
    deps = [
        ":gl_errors",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/strings",
    ],
)

flatbuffer_cc_library(
    name = "metadata_cc_fbs",
    srcs = ["metadata.fbs"],
    includes = [
        "//tensorflow/lite/delegates/gpu/gl:common_cc_fbs_includes",
        "//tensorflow/lite/delegates/gpu/gl:workgroups_cc_fbs_includes",
    ],
)

cc_library(
    name = "node_shader",
    hdrs = ["node_shader.h"],
    deps = [
        ":compiler_options",
        ":gpu_info",
        ":object",
        ":uniform_parameter",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
    ],
)

cc_library(
    name = "object",
    hdrs = ["object.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_library(
    name = "object_manager",
    srcs = ["object_manager.cc"],
    hdrs = ["object_manager.h"],
    deps = [
        ":gl_buffer",
        ":gl_texture",
        ":stats",
        "//tensorflow/lite/delegates/gpu/common:convert",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:span",
    ],
)

cc_library(
    name = "portable",
    hdrs = [
        "portable_egl.h",
        "portable_gl31.h",
    ],
)

cc_library(
    name = "runtime",
    srcs = ["runtime.cc"],
    hdrs = ["runtime.h"],
    deps = [
        ":command_queue",
        ":gl_buffer",
        ":gl_call",
        ":gl_errors",
        ":gl_program",
        ":gl_shader",
        ":gl_texture",
        ":gpu_info",
        ":object",
        ":object_manager",
        ":portable",
        ":runtime_options",
        ":stats",
        ":uniform_parameter",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl/runtime:shared_buffer",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_library(
    name = "runtime_options",
    hdrs = ["runtime_options.h"],
)

cc_library(
    name = "serialization",
    srcs = ["serialization.cc"],
    hdrs = ["serialization.h"],
    deps = [
        ":common_cc_fbs",
        ":compiled_model_cc_fbs",
        ":object",
        ":uniform_parameter",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:span",
        "@com_google_absl//absl/types:variant",
        "@flatbuffers",
    ],
)

cc_test(
    name = "serialization_test",
    srcs = ["serialization_test.cc"],
    tags = [
        "local",
        "nobuilder",
        "notap",
    ],
    deps = [
        ":object",
        ":serialization",
        ":uniform_parameter",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:span",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "stats",
    hdrs = ["stats.h"],
    deps = [
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "uniform_parameter",
    hdrs = ["uniform_parameter.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:variant",
    ],
)

flatbuffer_cc_library(
    name = "workgroups_cc_fbs",
    srcs = ["workgroups.fbs"],
    includes = [
        "//tensorflow/lite/delegates/gpu/gl:common_cc_fbs_includes",
    ],
)
418 tensorflow/lite/delegates/gpu/gl/api.cc Normal file
@@ -0,0 +1,418 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/api.h"

#include <algorithm>
#include <cstdint>
#include <deque>
#include <mutex>  // NOLINT
#include <unordered_map>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/object.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
#include "tensorflow/lite/delegates/gpu/gl/runtime.h"

#ifndef TFLITE_GPU_BINARY_RELEASE
#include "tensorflow/lite/delegates/gpu/gl/serialization.h"
#endif  // TFLITE_GPU_BINARY_RELEASE

namespace tflite {
namespace gpu {
namespace gl {
namespace {

using ObjectsSizes = std::unordered_map<ValueId, size_t>;

enum class InferenceContextState {
  NOT_STARTED,
  IN_PROGRESS,
};

class InferenceContextImpl : public InferenceContext {
 public:
  explicit InferenceContextImpl(std::unique_ptr<Runtime> runtime)
      : runtime_(std::move(runtime)) {}

  Status Execute() final {
    std::lock_guard<std::mutex> lock(guard_);
    if (state_ != InferenceContextState::NOT_STARTED) {
      return FailedPreconditionError("InferenceContext is not reset");
    }
    state_ = InferenceContextState::IN_PROGRESS;
    return runtime_->Execute();
  }

  Status Reset() final {
    std::lock_guard<std::mutex> lock(guard_);
    // TODO(akulik): should Reset not return Status?
    state_ = InferenceContextState::NOT_STARTED;
    return OkStatus();
  }

  RuntimeStats stats() const final { return runtime_->stats(); }

 private:
  std::unique_ptr<Runtime> runtime_;

  mutable std::mutex guard_;
  InferenceContextState state_ = InferenceContextState::NOT_STARTED;
};

class InferenceContextWithBatchImpl : public InferenceContext {
 public:
  InferenceContextWithBatchImpl(const ObjectsSizes& sizes,
                                const ObjectManager* objects,
                                std::unique_ptr<ObjectManager> refs,
                                std::unique_ptr<Runtime> runtime)
      : sizes_(sizes),
        objects_(objects),
        refs_(std::move(refs)),
        runtime_(std::move(runtime)) {}

  Status Execute() final {
    std::lock_guard<std::mutex> lock(guard_);
    if (state_ != InferenceContextState::NOT_STARTED) {
      return FailedPreconditionError("InferenceContext is not reset");
    }
    state_ = InferenceContextState::IN_PROGRESS;

    // Calculate expected number of batches and check that all external objects
    // match that number.
    int num_batches = 0;
    for (const auto& s : sizes_) {
      const ValueId id = s.first;
      const size_t byte_size = s.second;

      auto buffer = objects_->FindBuffer(id);
      if (!buffer) continue;

      if (buffer->bytes_size() % byte_size) {
        return InvalidArgumentError(absl::StrCat(
            "Object ", id, " does not match expected byte size: ", byte_size));
      }
      size_t b = buffer->bytes_size() / byte_size;
      if (num_batches == 0) {
        num_batches = b;
      } else {
        if (num_batches != b) {
          return InvalidArgumentError(absl::StrCat(
              "Object ", id, " size does not match expected batch size: ", b,
              " vs ", num_batches));
        }
      }
    }

    for (size_t b = 0; b < num_batches; ++b) {
      // slice external objects by batch.
      for (const auto& s : sizes_) {
        const ValueId id = s.first;
        const size_t byte_size = s.second;
        auto buffer = objects_->FindBuffer(id);
        if (buffer) {
          auto ref = refs_->FindBuffer(id);
          if (!ref) {
            return InvalidArgumentError(
                absl::StrCat("Reference to ", id, " is not found"));
          }
          RETURN_IF_ERROR(buffer->MakeView(b * byte_size, byte_size, ref));
        }
      }
      RETURN_IF_ERROR(runtime_->Execute());
    }
    return OkStatus();
  }

  Status Reset() final {
    std::lock_guard<std::mutex> lock(guard_);
    state_ = InferenceContextState::NOT_STARTED;
    // TODO(akulik): should Reset not return Status?
    return OkStatus();
  }

  RuntimeStats stats() const final { return runtime_->stats(); }

 private:
  const ObjectsSizes sizes_;
  const ObjectManager* objects_;

  // view over external objects provided by a user.
  std::unique_ptr<ObjectManager> refs_;
  std::unique_ptr<Runtime> runtime_;

  mutable std::mutex guard_;
  InferenceContextState state_ = InferenceContextState::NOT_STARTED;
};
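// Worked example of the batch handling above (illustrative numbers, not part
// of the original sources): if an external object was registered with a
// byte_size of 1024 and the user-supplied buffer holds 4096 bytes, Execute()
// resolves num_batches to 4 and runs the runtime four times, each time
// re-pointing the per-batch references at the next 1024-byte slice via
// MakeView().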

struct ProgramParameters {
  // A list of uniform parameters to be set.
  std::vector<UniformParameter> parameters;

  // A list of objects to bind to opengl program.
  std::vector<Object> objects;

  uint3 workgroup_size;
  uint3 num_workgroups;

  size_t shader_idx;
};

std::string GetShaderHeader(uint3 localsize) {
  return absl::StrCat("#version 310 es\nlayout(local_size_x = ", localsize.x,
                      ", local_size_y = ", localsize.y,
                      ", local_size_z = ", localsize.z, ") in;\n");
}
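// For example (illustrative values), GetShaderHeader(uint3(8, 4, 1)) produces:
//   #version 310 es
//   layout(local_size_x = 8, local_size_y = 4, local_size_z = 1) in;
// This header is prepended to every partial shader before compilation in
// AddFullShader() below.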

class CompiledModelImpl
#ifndef TFLITE_GPU_BINARY_RELEASE
    : public CompiledModel,
      public DeserializationHandler {
#else
    : public CompiledModel {
#endif  // TFLITE_GPU_BINARY_RELEASE
 public:
  explicit CompiledModelImpl(const GpuInfo& gpu_info) : gpu_info_(gpu_info) {}

  // Called while compiling shaders from scratch
  Status Add(const WorkgroupsCalculator& workgroup_calculator,
             ShaderCode code) {
    // Calculate workgroup size.
    uint3 workgroup_size = workgroup_calculator.Calculate(code);
    uint3 num_workgroups = IntegralDivideRoundUp(code.workload, workgroup_size);

    for (const auto& object : code.objects) {
      if (IsRef(object)) {
        object_sizes_[GetRef(object)] = ByteSizeOf(object);
      }
    }

    // Store full shader and compile it if necessary.
    size_t shader_idx;
    RETURN_IF_ERROR(
        AddFullShader(code.source_code, workgroup_size, &shader_idx));
    programs_.push_back({
        std::move(code.parameters),
        std::move(code.objects),
        workgroup_size,
        num_workgroups,
        shader_idx,
    });
    return OkStatus();
  }

  // Store full shader and compile it if necessary.
  // Returns full_shader_index
  Status AddFullShader(const std::string& partial_shader,
                       const uint3& workgroup_size, size_t* size) {
    std::string shader_src = GetShaderHeader(workgroup_size) + partial_shader;
    auto it = shader_to_index_.find(shader_src);
    if (it == shader_to_index_.end()) {
      GlShader shader;
      RETURN_IF_ERROR(
          GlShader::CompileShader(GL_COMPUTE_SHADER, shader_src, &shader));
      shaders_.push_back(std::move(shader));
      shader_to_index_.insert({shader_src, shader_to_index_.size()});
      *size = shader_to_index_.size() - 1;
    } else {
      *size = it->second;
    }
    return OkStatus();
  }

  Status NewRun(
      const RuntimeOptions& options, const ObjectManager* objects,
      CommandQueue* command_queue,
      std::unique_ptr<InferenceContext>* inference_context) const final {
    std::unique_ptr<ObjectManager> refs;
    if (dynamic_batch_) {
      // Runtime is using objects from refs that will point to provided objects.
      // At this point just create 0 batch slice references.
      refs = absl::make_unique<ObjectManager>();
      for (const auto& s : object_sizes_) {
        auto buffer = objects->FindBuffer(s.first);
        if (!buffer) continue;
        GlBuffer ref;
        RETURN_IF_ERROR(buffer->MakeView(0, s.second, &ref));
        RETURN_IF_ERROR(refs->RegisterBuffer(s.first, std::move(ref)));
      }
    }
    auto runtime = absl::make_unique<Runtime>(options, gpu_info_, command_queue,
                                              (refs ? refs.get() : objects));
    for (auto& c : programs_) {
      RETURN_IF_ERROR(runtime->AddProgram(shaders_[c.shader_idx], c.parameters,
                                          c.objects, c.num_workgroups));
    }
    RETURN_IF_ERROR(runtime->PrepareForExecution());
    if (dynamic_batch_) {
      *inference_context = absl::make_unique<InferenceContextWithBatchImpl>(
          object_sizes_, objects, std::move(refs), std::move(runtime));
    } else {
      *inference_context =
          absl::make_unique<InferenceContextImpl>(std::move(runtime));
    }
    return OkStatus();
  }

#ifndef TFLITE_GPU_BINARY_RELEASE
  // Called on deserialization
  Status OnProgram(const std::vector<UniformParameter>& parameters,
                   const std::vector<Object>& objects,
                   const uint3& workgroup_size, const uint3& num_workgroups,
                   size_t partial_shader_index) final {
    for (auto& object : objects) {
      if (IsRef(object)) {
        object_sizes_[GetRef(object)] = ByteSizeOf(object);
      }
    }

    size_t shader_idx;
    RETURN_IF_ERROR(AddFullShader(partial_shaders_[partial_shader_index],
                                  workgroup_size, &shader_idx));
    programs_.push_back({
        parameters,
        objects,
        workgroup_size,
        num_workgroups,
        shader_idx,
    });
    return OkStatus();
  }

  Status Serialize(
      std::vector<uint8_t>* serialized_compiled_model) const final {
    SerializedCompiledModelBuilder builder;

    // sort shaders first. They need to be serialized in order.
    std::vector<std::string> full_shaders(shaders_.size());
    for (const auto& shader : shader_to_index_) {
      full_shaders[shader.second] = shader.first;
    }

    std::unordered_map<std::string, size_t> partial_shader_to_index;
    std::vector<std::string> partial_shaders;
    for (const auto& program : programs_) {
      // Remove a header from a shader.
      std::string shader_without_header = full_shaders[program.shader_idx];
      shader_without_header.erase(0, shader_without_header.find("in;") + 3);

      // Insert shader into partial shaders array.
      auto it = partial_shader_to_index.find(shader_without_header);
      size_t shader_idx;
      if (it == partial_shader_to_index.end()) {
        shader_idx = partial_shaders.size();
        partial_shaders.push_back(shader_without_header);
        builder.AddShader(shader_without_header);
        partial_shader_to_index.insert({shader_without_header, shader_idx});
      } else {
        shader_idx = it->second;
      }
      builder.AddProgram(program.parameters, program.objects,
                         program.workgroup_size, program.num_workgroups,
                         shader_idx);
    }
    CompiledModelOptions options;
    options.dynamic_batch = dynamic_batch_;
    auto data = builder.Finalize(options);
    serialized_compiled_model->insert(serialized_compiled_model->end(),
                                      data.begin(), data.end());
    return OkStatus();
  }

  Status OnShader(absl::Span<const char> shader_src) final {
    std::string source(shader_src.data(), shader_src.size());
    partial_shaders_.push_back(source);
    return OkStatus();
  }

  void OnOptions(const CompiledModelOptions& options) final {
    dynamic_batch_ = options.dynamic_batch;
  }
#endif  // TFLITE_GPU_BINARY_RELEASE

  CompilerStats stats() const final { return stats_; }

  void set_dynamic_batch(bool dynamic_batch) { dynamic_batch_ = dynamic_batch; }

 private:
  const GpuInfo gpu_info_;
  bool dynamic_batch_ = false;

  std::vector<std::string> partial_shaders_;
  std::vector<GlShader> shaders_;

  // Shaders are serialized in order of their indices.
  std::unordered_map<std::string, size_t> shader_to_index_;
  std::deque<ProgramParameters> programs_;
  std::unordered_map<ValueId, size_t> object_sizes_;
  CompilerStats stats_;
};

// @return true if all tensors have same batch value.
bool IsBatchMatchesForAllValues(const GraphFloat32& model) {
  int32_t b = model.values()[0]->tensor.shape.b;
  for (auto value : model.values()) {
    if (value->tensor.shape.b != b) {
      return false;
    }
  }
  return true;
}

}  // namespace

Status Compile(const CompilationOptions& options, const GraphFloat32& model,
               const NodeShader& node_shader,
               const WorkgroupsCalculator& workgroup_calculator,
               std::unique_ptr<CompiledModel>* compiled_model) {
  if (!IsBatchMatchesForAllValues(model)) {
    return InvalidArgumentError("Only identical batch dimension is supported");
  }
  GpuInfo gpu_info;
  RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
  auto compiled_model_impl = absl::make_unique<CompiledModelImpl>(gpu_info);
  compiled_model_impl->set_dynamic_batch(options.dynamic_batch);
  auto compiler = NewCompiler(&node_shader, &gpu_info, options);
  RETURN_IF_ERROR(compiler->Compile(model, [&](ShaderCode code) -> Status {
    return compiled_model_impl->Add(workgroup_calculator, std::move(code));
  }));
||||||
|
*compiled_model = std::move(compiled_model_impl);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef TFLITE_GPU_BINARY_RELEASE
|
||||||
|
Status ReadSerializedModel(const std::vector<uint8_t>& serialized_model,
|
||||||
|
std::unique_ptr<CompiledModel>* compiled_model) {
|
||||||
|
GpuInfo gpu_info;
|
||||||
|
RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
|
||||||
|
auto compiled_model_impl = absl::make_unique<CompiledModelImpl>(gpu_info);
|
||||||
|
RETURN_IF_ERROR(DeserializeCompiledModel(
|
||||||
|
absl::MakeConstSpan(serialized_model), compiled_model_impl.get()));
|
||||||
|
*compiled_model = std::move(compiled_model_impl);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
#endif // TFLITE_GPU_BINARY_RELEASE
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
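Editorial note (not part of the original commit): Serialize() above deduplicates shader sources by stripping the generated header (everything up to the trailing "in;" declaration) and keying an unordered_map on the remaining body. A minimal standalone sketch of that idea follows; all names here are illustrative and the "in;" marker is assumed to be present, mirroring the erase() call above.

// Sketch only: shader-body deduplication in the spirit of Serialize() above.
#include <string>
#include <unordered_map>
#include <vector>

// Strips the generated prefix, assuming it ends right after the last "in;".
std::string StripHeader(const std::string& full_shader) {
  return full_shader.substr(full_shader.find("in;") + 3);
}

// Returns the index of a shader body, adding it only if it was not seen yet.
size_t DedupShader(const std::string& body,
                   std::unordered_map<std::string, size_t>* index,
                   std::vector<std::string>* storage) {
  auto it = index->find(body);
  if (it != index->end()) return it->second;
  const size_t idx = storage->size();
  storage->push_back(body);
  index->insert({body, idx});
  return idx;
}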
103
tensorflow/lite/delegates/gpu/gl/api.h
Normal file
@ -0,0 +1,103 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_API_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_API_H_

#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler_options.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/object_manager.h"
#include "tensorflow/lite/delegates/gpu/gl/runtime_options.h"
#include "tensorflow/lite/delegates/gpu/gl/stats.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/calculator.h"

namespace tflite {
namespace gpu {
namespace gl {

class InferenceContext;

// Represents a model that was prepared for execution. It is stored in a format
// most suitable for execution and optionally may include pre-generated or
// pre-compiled GPU shaders or whatever is needed for efficient execution.
class CompiledModel {
 public:
  virtual ~CompiledModel() = default;

  virtual CompilerStats stats() const = 0;

  // Creates a new inference context. Result can outlive @this.
  //
  // The NewRun call as well as subsequent calls to InferenceContext methods
  // should be done from the same EGL context.
  virtual Status NewRun(
      const RuntimeOptions& options, const ObjectManager* objects,
      CommandQueue* command_queue,
      std::unique_ptr<InferenceContext>* inference_context) const = 0;

#ifndef TFLITE_GPU_BINARY_RELEASE
  // Serializes compiled model to a string.
  // @return OkStatus if serialization finished successfully.
  virtual Status Serialize(
      std::vector<uint8_t>* serialized_compiled_model) const = 0;
#endif  // TFLITE_GPU_BINARY_RELEASE
};

// Turns the given model into "compiled" form that is suitable for inference.
Status Compile(const CompilationOptions& options, const GraphFloat32& model,
               const NodeShader& node_shader,
               const WorkgroupsCalculator& workgroup_calculator,
               std::unique_ptr<CompiledModel>* compiled_model);

#ifndef TFLITE_GPU_BINARY_RELEASE
// Reads serialized representation previously created with
// CompiledModel::Serialize call.
Status ReadSerializedModel(const std::vector<uint8_t>& serialized_model,
                           std::unique_ptr<CompiledModel>* compiled_model);
#endif  // TFLITE_GPU_BINARY_RELEASE

// Encapsulates everything needed for one or more inference executions done
// sequentially.
//
// Thread-safe.
class InferenceContext {
 public:
  virtual ~InferenceContext() = default;

  virtual RuntimeStats stats() const = 0;

  // Executes inference.
  virtual Status Execute() = 0;

  // Asks the context to reset itself for another round. Note that inputs and
  // outputs are not cleared, so it is possible to re-use them.
  // It is an error to call Reset while a previous run is still in progress.
  virtual Status Reset() = 0;
};

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_API_H_
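Editorial note (not part of the original commit): a minimal sketch of the typical call sequence for the API declared above. It assumes the graph, node shader, workgroup calculator and input/output ObjectManager have already been set up by the caller, and it borrows RequestGpuInfo and NewCommandQueue from other files in this commit; error handling is collapsed into RETURN_IF_ERROR.

// Sketch only: compile a graph and run inference once.
Status CompileAndRunOnce(const GraphFloat32& graph,
                         const NodeShader& node_shader,
                         const WorkgroupsCalculator& calculator,
                         const ObjectManager* objects) {
  CompilationOptions compile_options;
  std::unique_ptr<CompiledModel> compiled_model;
  RETURN_IF_ERROR(Compile(compile_options, graph, node_shader, calculator,
                          &compiled_model));

  GpuInfo gpu_info;
  RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
  auto command_queue = NewCommandQueue(gpu_info);

  RuntimeOptions runtime_options;
  std::unique_ptr<InferenceContext> context;
  RETURN_IF_ERROR(compiled_model->NewRun(runtime_options, objects,
                                         command_queue.get(), &context));
  return context->Execute();
}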
85
tensorflow/lite/delegates/gpu/gl/command_queue.cc
Normal file
@ -0,0 +1,85 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_sync.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

class DefaultCommandQueue : public CommandQueue {
 public:
  Status Dispatch(const GlProgram& program, const uint3& workgroups) override {
    RETURN_IF_ERROR(program.Dispatch(workgroups));
    return TFLITE_GPU_CALL_GL(glMemoryBarrier, GL_ALL_BARRIER_BITS);
  }

  Status WaitForCompletion() override {
    // TODO(akulik): maybe let a user choose which wait method to use.
    return GlActiveSyncWait();
  }
};

// On Adreno, flush periodically as this affects performance. The command queue
// needs to be manually managed to ensure that accumulated work goes to the GPU
// as fast as it can.
//
// Also, on older Adreno devices glFlush is required after every memory barrier
// to avoid hitting a GPU driver bug.
class AdrenoCommandQueue : public DefaultCommandQueue {
 public:
  explicit AdrenoCommandQueue(int flush_every_n)
      : flush_every_n_(flush_every_n) {}

  Status Dispatch(const GlProgram& program, const uint3& workgroups) final {
    RETURN_IF_ERROR(DefaultCommandQueue::Dispatch(program, workgroups));
    if ((++program_counter_ % flush_every_n_) == 0) {
      glFlush();
    }
    return OkStatus();
  }

 private:
  const int flush_every_n_;
  int program_counter_ = 0;
};

}  // namespace

std::unique_ptr<CommandQueue> NewCommandQueue(const GpuInfo& gpu_info) {
  if (gpu_info.type == GpuType::ADRENO) {
    int flush_every_n = 1;
    // On Adreno 630 and Adreno 505 there is up to a 2x performance boost when
    // glFlush happens less often.
    if (gpu_info.gpu_model == GpuModel::ADRENO630 ||
        gpu_info.gpu_model == GpuModel::ADRENO505) {
      flush_every_n = 10;
    }
    return absl::make_unique<AdrenoCommandQueue>(flush_every_n);
  }
  return absl::make_unique<DefaultCommandQueue>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
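Editorial note (not part of the original commit): the periodic-flush policy implemented by AdrenoCommandQueue above can be isolated into a tiny counter, shown here only to make the cadence explicit. With flush_every_n = 10, glFlush would be issued on the 10th, 20th, 30th, ... dispatch, while every dispatch still gets the full memory barrier from DefaultCommandQueue::Dispatch.

// Sketch only: the flush cadence used by AdrenoCommandQueue, as a standalone helper.
class PeriodicFlushCounter {
 public:
  explicit PeriodicFlushCounter(int flush_every_n)
      : flush_every_n_(flush_every_n) {}

  // Returns true when the caller should issue glFlush().
  bool ShouldFlush() { return (++counter_ % flush_every_n_) == 0; }

 private:
  const int flush_every_n_;
  int counter_ = 0;
};
// PeriodicFlushCounter(10).ShouldFlush() is true on calls 10, 20, 30, ...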
52
tensorflow/lite/delegates/gpu/gl/command_queue.h
Normal file
@ -0,0 +1,52 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMMAND_QUEUE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMMAND_QUEUE_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"

namespace tflite {
namespace gpu {
namespace gl {

// GL programs can be executed directly via a dispatch call or using a queue
// abstraction similar to the one in OpenCL and Vulkan.
// CommandQueue executes given programs in the order they come.
class CommandQueue {
 public:
  virtual ~CommandQueue() = default;

  // Dispatches a program. It may or may not call glFlush.
  virtual Status Dispatch(const GlProgram& program,
                          const uint3& workgroups) = 0;

  // Waits until all programs dispatched prior to this call are completed.
  virtual Status WaitForCompletion() = 0;
};

// By default a memory barrier is inserted after every dispatch.
std::unique_ptr<CommandQueue> NewCommandQueue(const GpuInfo& gpu_info);

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMMAND_QUEUE_H_
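Editorial note (not part of the original commit): a minimal usage sketch of the queue abstraction declared above, using only the functions from this header. The program and workgroup setup are assumed to come from the caller.

// Sketch only: dispatch one program and wait for the GPU to finish.
Status DispatchAndWait(const GpuInfo& gpu_info, const GlProgram& program,
                       const uint3& workgroups) {
  std::unique_ptr<CommandQueue> queue = NewCommandQueue(gpu_info);
  RETURN_IF_ERROR(queue->Dispatch(program, workgroups));
  return queue->WaitForCompletion();
}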
16
tensorflow/lite/delegates/gpu/gl/common.fbs
Normal file
@ -0,0 +1,16 @@
namespace tflite.gpu.gl.data;

table Uint3 {
  x:uint32;
  y:uint32;
  z:uint32;
}

table Uint2 {
  x:uint32;
  y:uint32;
}

table Uint1 {
  x:uint32;
}
155
tensorflow/lite/delegates/gpu/gl/compiled_model.fbs
Normal file
@ -0,0 +1,155 @@
include "common.fbs";

namespace tflite.gpu.gl.data;

file_identifier "AFCM";

file_extension "flow";

// Encapsulates entire OpenGL program with all necessary dependencies and
// parameters.
table Program {
  // A collection of objects this program refers to.
  objects:[Object];

  // Uniform parameters to be set before execution.
  parameters:[UniformParameter];

  // Defines the number of work groups.
  number_workgroups:Uint3;

  // Defines the size of a workgroup.
  workgroup_size:Uint3;

  // Reference to a shader in this compiled model.
  shader_index:uint32;

  // Contains binary code that was once created after successful shader
  // compilation. Normally it is much faster to instantiate a program from
  // compiled binary.
  binary:ProgramBinary;
}

// Compiled binary representation of a program.
table ProgramBinary {
  format:uint32;  // GLenum

  // Compiled binary shader blob extracted from GL.
  binary:[ubyte];
}

enum ParameterType : byte {
  INT32 = 0,
  UINT32 = 1,
  FLOAT32 = 2,
  INT32_2 = 3,
}

enum DataType : byte {
  UNKNOWN = 0,
  FLOAT32 = 1,
  FLOAT16 = 2,
  INT32 = 3,
  INT16 = 4,
}

union DataVariant {
  DataInt32,
  DataFloat,
  DataUint32,
}

table DataFloat {
  data:[float];
}

table DataInt32 {
  data:[int32];
}

table DataUint32 {
  data:[uint32];
}

table UniformParameter {
  name:string;

  type:ParameterType;

  // Data is optional. If it is known in advance, it is encoded here, otherwise
  // a parameter will be set at runtime.
  data:DataVariant;
}

enum AccessType : byte {
  READ = 0,
  WRITE = 1,
  READ_WRITE = 2,
}

enum ObjectType : byte {
  UNKNOWN = 0,
  BUFFER = 1,
  TEXTURE = 2,
}

union ObjectVariant {
  ObjectData,
  ObjectRef,
}

union ObjectSize {
  Uint1,
  Uint2,
  Uint3,
}

table Object {
  access:AccessType;

  binding:uint32;

  data_type:DataType;

  type:ObjectType;

  size:ObjectSize;

  object:ObjectVariant;
}

// Represents a reference to another object provided by an object manager.
table ObjectRef {
  // Unique global identifier to be used by an object manager to look up this
  // buffer.
  global_id:uint32;
}

table ObjectData {
  data:[uint8];
}

// Represents entire model as a collection of programs, inputs and outputs.
table CompiledModel {
  parameters:Parameters;

  // A collection of shaders used by programs.
  shaders:[string];

  // A collection of programs that need to be executed in the same order.
  programs:[Program];
}

table Parameters {
  // Indicates the flow engine version that compiled this model. If the engine
  // version does not match the compiled model, the model needs to be
  // recompiled.
  // version:uint32; // not implemented

  // Could potentially be used to track the environment when a model was
  // compiled and detect whether it was changed and model recompilation is
  // needed.
  // environment_hash:uint32; // not implemented

  dynamic_batch:bool;
}

root_type CompiledModel;
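Editorial note (not part of the original commit): this schema is populated by CompiledModelImpl::Serialize in gl/api.cc above, which writes the deduplicated shader bodies into `shaders`, one `Program` per dispatch (parameters, objects, workgroup sizes, shader_index), and the `dynamic_batch` flag into `Parameters`. A hedged sketch of a full round trip through the schema, using only the public functions declared in gl/api.h and available only when TFLITE_GPU_BINARY_RELEASE is not defined:

// Sketch only: serialize a compiled model into an "AFCM" flatbuffer blob and
// load it back.
Status RoundTrip(const CompiledModel& model,
                 std::unique_ptr<CompiledModel>* restored) {
  std::vector<uint8_t> blob;  // holds a data::CompiledModel flatbuffer
  RETURN_IF_ERROR(model.Serialize(&blob));
  return ReadSerializedModel(blob, restored);
}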
295
tensorflow/lite/delegates/gpu/gl/compiler.cc
Normal file
@ -0,0 +1,295 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler.h"

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_auto_input.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inline.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inplace.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.h"
#include "tensorflow/lite/delegates/gpu/gl/float16_conversions.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

struct ExceedSizeChecker {
  bool operator()(uint32_t v) const { return v > max_size; }

  bool operator()(const uint2& v) const {
    return v.x > max_size || v.y > max_size;
  }

  bool operator()(const uint3& v) const {
    return v.x > max_size || v.y > max_size || v.z > max_z_size;
  }

  int max_size;
  int max_z_size;
};

// Returns true if any size variable exceeds the given limit
bool ExceedsMaxSize(const Object& object, const GpuInfo& gpu_info) {
  return absl::visit(ExceedSizeChecker{gpu_info.max_texture_size,
                                       gpu_info.max_array_texture_layers},
                     object.size);
}

ObjectType ChooseFastestObjectType(const GpuInfo& gpu_info) {
  return gpu_info.type == GpuType::ADRENO ? ObjectType::TEXTURE
                                          : ObjectType::BUFFER;
}

ObjectType ChooseFastestRefObjectType(const GpuInfo& gpu_info,
                                      const CompilationOptions& options) {
  if (gpu_info.type != GpuType::ADRENO) {
    return ObjectType::BUFFER;
  }
  switch (gpu_info.gpu_model) {
    case GpuModel::ADRENO630:
      return ObjectType::TEXTURE;
    default:
      return options.allow_precision_loss ? ObjectType::TEXTURE
                                          : ObjectType::BUFFER;
  }
}

// Compiler executes the following steps:
// 1. Runs NodeShader for every node in the input graph.
// 2. Creates a compiled graph that mirrors the input graph and keeps
//    GeneratedCode in operation's attributes.
// 3. Fuses nodes in the compiled graph.
// 4. Generates the full shader code using the nodes in the compiled graph.
class CompilerImpl : public Compiler {
 public:
  // We use const GpuInfo* because a pointer prevents binding a temporary
  // object.
  CompilerImpl(const NodeShader* node_shader, const GpuInfo* gpu_info,
               const CompilationOptions& options)
      : node_shader_(*node_shader), gpu_info_(*gpu_info), options_(options) {
    if (options_.preferred_obj_type == ObjectType::UNKNOWN) {
      options_.preferred_obj_type = ChooseFastestObjectType(*gpu_info);
    }
    if (options_.ref_obj_type == ObjectType::UNKNOWN) {
      options_.ref_obj_type = ChooseFastestRefObjectType(*gpu_info, options);
    }
  }

  Status Compile(const GraphFloat32& graph,
                 const ShaderCodeCallback& callback) final {
    // It is important to have ids in the compiled graph identical to the given
    // graph.
    RETURN_IF_ERROR(graph.MakeExactCopy(&compiled_graph_));

    // Clear out batch dimension for dynamic batch support.
    if (options_.dynamic_batch) {
      for (auto value : compiled_graph_.values()) {
        value->tensor.shape.b = 1;
      }
    }

    // Generate a shader for a node and all input/output objects.
    for (auto node : compiled_graph_.nodes()) {
      CompiledNodeAttributes attr;
      attr.node_indices.push_back(node->id);
      RETURN_IF_ERROR(node_shader_.GenerateCode(
          {&compiled_graph_, &gpu_info_, node, options_}, &attr.code));
      node->operation.attributes = std::move(attr);
    }

    ModelTransformer transformer(&compiled_graph_, nullptr);
    if (options_.fuse_operations) {
      FuseAutoOutputWithInline fuse_inline;
      if (!transformer.Apply("fuse_auto_with_inline", &fuse_inline)) {
        return InternalError("fuse_auto_with_inline failed");
      }
      FuseInplaceUpdate fuse_inplace;
      if (!transformer.Apply("fuse_inplace_update", &fuse_inplace)) {
        return InternalError("fuse_inplace failed");
      }
      if (options_.auto_input_fusion) {
        FuseAutoInput fuse_auto_input;
        if (!transformer.Apply("fuse_auto_input", &fuse_auto_input)) {
          return InternalError("fuse_auto_input failed");
        }
      }
    }
    RemoveUnusedInplaceUpdates remove_inplace_updates;
    if (!transformer.Apply("remove_inplace_updates", &remove_inplace_updates)) {
      return InternalError("remove_inplace_updates failed");
    }

    // Prepare internal objects.
    std::unordered_map<ValueId, Object> objects;
    for (auto value : compiled_graph_.values()) {
      Object object = MakePHWC4Ref(value->id, value->tensor.shape);
      object.data_type = value->tensor.type;
      // External references may not be upgraded to f16 nor be represented as
      // textures.
      bool is_external =
          graph.IsGraphOutput(value->id) || graph.IsGraphInput(value->id);
      if (is_external) {
        object.object_type = ObjectType::BUFFER;
      } else {
        if (options_.allow_precision_loss) {
          MaybeConvertToFloat16(&object);
        }
      }
      objects[value->id] = std::move(object);
    }

    // Prepare readonly objects and check whether object types are supported.
    for (auto node : compiled_graph_.nodes()) {
      auto& attr =
          absl::any_cast<CompiledNodeAttributes&>(node->operation.attributes);

      // Set workload explicitly.
      if (attr.code.workload == uint3()) {
        auto outputs = compiled_graph_.FindOutputs(node->id);
        auto shape = outputs[0]->tensor.shape;
        for (auto output : outputs) {
          if (shape != output->tensor.shape) {
            return FailedPreconditionError(
                "Workload uint3() requires all output sizes to match");
          }
        }
        attr.code.workload =
            uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4));
      }

      int num_textures = 0;
      // Counts number of used textures and chooses ObjectType for an object.
      auto set_object_type = [&](Object* object) {
        if (object->object_type == ObjectType::BUFFER) {
          // Don't change from buffer once it is set.
          return;
        }
        bool is_ref = IsRef(*object);
        if (num_textures < gpu_info_.max_image_units &&
            !ExceedsMaxSize(*object, gpu_info_) &&
            (object->object_type == ObjectType::TEXTURE ||
             (is_ref && options_.ref_obj_type == ObjectType::TEXTURE) ||
             (!is_ref && options_.preferred_obj_type == ObjectType::TEXTURE))) {
          object->object_type = ObjectType::TEXTURE;
          num_textures++;
        } else {
          object->object_type = ObjectType::BUFFER;
        }
      };

      for (auto& object : attr.code.objects) {
        // Downgrade readonly objects to F16 if requested.
        if (options_.allow_precision_loss) {
          MaybeConvertToFloat16(&object.second);
        }
        set_object_type(&object.second);
      }

      for (auto ref : compiled_graph_.FindInputs(node->id)) {
        set_object_type(&objects[ref->id]);
      }
      for (auto ref : compiled_graph_.FindOutputs(node->id)) {
        set_object_type(&objects[ref->id]);
      }
    }

    // Generate shaders from the transformed graph.
    ShaderCodegen codegen(options_, gpu_info_);
    for (auto node : compiled_graph_.nodes()) {
      auto& attr =
          absl::any_cast<CompiledNodeAttributes&>(node->operation.attributes);
      if (attr.code.source_code.empty()) {
        // noop. Skip this node.
        continue;
      }

      // Declare inputs and outputs explicitly.
      for (auto ref : compiled_graph_.FindInputs(node->id)) {
        auto object = objects[ref->id];
        object.access = AccessType::READ;
        attr.inputs.push_back(object);
      }
      for (auto ref : compiled_graph_.FindOutputs(node->id)) {
        auto object = objects[ref->id];
        object.access = AccessType::WRITE;
        attr.outputs.push_back(object);
      }

      // Allocate bindings. Textures must be bound first. max_image_units also
      // defines max binding number for a texture.
      uint32_t binding = 0;
      auto set_binding = [&](ObjectType type, Object& object) {
        if (object.object_type == type) {
          object.binding = binding++;
        }
      };
      for (auto& object : attr.inputs) {
        set_binding(ObjectType::TEXTURE, object);
      }
      for (auto& object : attr.outputs) {
        set_binding(ObjectType::TEXTURE, object);
      }
      for (auto& object : attr.code.objects) {
        set_binding(ObjectType::TEXTURE, object.second);
      }
      for (auto& object : attr.inputs) {
        set_binding(ObjectType::BUFFER, object);
      }
      for (auto& object : attr.outputs) {
        set_binding(ObjectType::BUFFER, object);
      }
      for (auto& object : attr.code.objects) {
        set_binding(ObjectType::BUFFER, object.second);
      }

      // Generate source code.
      ShaderCode shader_code;
      RETURN_IF_ERROR(codegen.Build(std::move(attr), &shader_code));
      RETURN_IF_ERROR(callback(std::move(shader_code)));
    }
    return OkStatus();
  }

 private:
  const NodeShader& node_shader_;
  const GpuInfo& gpu_info_;
  CompilationOptions options_;
  GraphFloat32 compiled_graph_;
};

}  // namespace

std::unique_ptr<Compiler> NewCompiler(const NodeShader* node_shader,
                                      const GpuInfo* gpu_info,
                                      const CompilationOptions& options) {
  return absl::make_unique<CompilerImpl>(node_shader, gpu_info, options);
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
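Editorial note (not part of the original commit): when a node leaves attr.code.workload as uint3(), CompilerImpl::Compile above derives the workload from the output shape as (w, h, ceil(c / 4)), since one work item covers one (x, y) position and a group of four channels. A self-contained worked example follows; the real IntegralDivideRoundUp helper lives in the common utilities, so an equivalent is redefined locally to keep the snippet standalone.

// Sketch only: the default workload calculation used above.
#include <cstdint>

uint32_t DivideRoundUp(uint32_t n, uint32_t d) { return (n + d - 1) / d; }

struct Workload { uint32_t x, y, z; };

// For a WxHxC output, the workload is (w, h, ceil(c / 4)).
Workload DefaultWorkload(uint32_t w, uint32_t h, uint32_t c) {
  return {w, h, DivideRoundUp(c, 4)};
}
// DefaultWorkload(224, 224, 3) -> {224, 224, 1}
// DefaultWorkload(224, 224, 9) -> {224, 224, 3}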
54
tensorflow/lite/delegates/gpu/gl/compiler.h
Normal file
@ -0,0 +1,54 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_H_

#include <functional>
#include <memory>

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_code.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler_options.h"
#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"

namespace tflite {
namespace gpu {
namespace gl {

using ShaderCodeCallback = std::function<Status(ShaderCode code)>;

class Compiler {
 public:
  virtual ~Compiler() = default;

  // Goes over a graph and generates OpenGL shaders for the given graph.
  // Callback is called for every generated shader. Callback may execute
  // shaders as they come or store them elsewhere to execute later.
  virtual Status Compile(const GraphFloat32& graph,
                         const ShaderCodeCallback& callback) = 0;
};

std::unique_ptr<Compiler> NewCompiler(
    const NodeShader* node_shader, const GpuInfo* gpu_info,
    const CompilationOptions& options);

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_H_
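Editorial note (not part of the original commit): a minimal sketch of driving the Compiler interface declared above. The callback simply collects every generated shader; gl/api.cc's Compile() follows the same pattern, except that it hands each ShaderCode to its CompiledModelImpl instead of a vector.

// Sketch only: collect all generated shaders via ShaderCodeCallback.
Status CollectShaders(const GraphFloat32& graph, const NodeShader& node_shader,
                      const GpuInfo& gpu_info,
                      const CompilationOptions& options,
                      std::vector<ShaderCode>* shaders) {
  auto compiler = NewCompiler(&node_shader, &gpu_info, options);
  return compiler->Compile(graph, [&](ShaderCode code) -> Status {
    shaders->push_back(std::move(code));
    return OkStatus();
  });
}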
192
tensorflow/lite/delegates/gpu/gl/compiler/BUILD
Normal file
@ -0,0 +1,192 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])  # Apache 2.0

cc_library(
    name = "preprocessor",
    srcs = ["preprocessor.cc"],
    hdrs = ["preprocessor.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:status",
        "@com_google_absl//absl/strings",
    ],
)

cc_test(
    name = "preprocessor_test",
    srcs = ["preprocessor_test.cc"],
    tags = [
        "local",
    ],
    deps = [
        ":preprocessor",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "parameter_accessor",
    srcs = ["parameter_accessor.cc"],
    hdrs = ["parameter_accessor.h"],
    deps = [
        ":preprocessor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl:uniform_parameter",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_test(
    name = "parameter_accessor_test",
    srcs = ["parameter_accessor_test.cc"],
    tags = [
        "local",
    ],
    deps = [
        ":parameter_accessor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "object_accessor",
    srcs = ["object_accessor.cc"],
    hdrs = ["object_accessor.h"],
    deps = [
        ":parameter_accessor",
        ":preprocessor",
        "//tensorflow/lite/delegates/gpu/common:data_type",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl:object",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/types:variant",
    ],
)

cc_test(
    name = "object_accessor_test",
    srcs = ["object_accessor_test.cc"],
    tags = [
        "local",
    ],
    deps = [
        ":object_accessor",
        ":parameter_accessor",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/types:variant",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "shader_code",
    hdrs = ["shader_code.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl:object",
        "//tensorflow/lite/delegates/gpu/gl:uniform_parameter",
    ],
)

cc_library(
    name = "shader_codegen",
    srcs = ["shader_codegen.cc"],
    hdrs = ["shader_codegen.h"],
    deps = [
        ":compiled_node",
        ":object_accessor",
        ":parameter_accessor",
        ":preprocessor",
        ":shader_code",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/gl:compiler_options",
        "//tensorflow/lite/delegates/gpu/gl:gpu_info",
        "//tensorflow/lite/delegates/gpu/gl:object",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "compiled_node",
    srcs = ["compiled_node.cc"],
    hdrs = ["compiled_node.h"],
    deps = [
        ":rename",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/gl:node_shader",
        "//tensorflow/lite/delegates/gpu/gl:object",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "fuse_inplace",
    srcs = ["fuse_inplace.cc"],
    hdrs = ["fuse_inplace.h"],
    deps = [
        ":compiled_node",
        ":preprocessor",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl:node_shader",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:any",
    ],
)

cc_library(
    name = "fuse_inline",
    srcs = ["fuse_inline.cc"],
    hdrs = ["fuse_inline.h"],
    deps = [
        ":compiled_node",
        ":shader_code",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/gl:node_shader",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:any",
    ],
)

cc_library(
    name = "rename",
    srcs = ["rename.cc"],
    hdrs = ["rename.h"],
    deps = [
        ":object_accessor",
        ":parameter_accessor",
        ":preprocessor",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/gl:node_shader",
        "//tensorflow/lite/delegates/gpu/gl:object",
        "//tensorflow/lite/delegates/gpu/gl:uniform_parameter",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "fuse_auto_input",
    srcs = ["fuse_auto_input.cc"],
    hdrs = ["fuse_auto_input.h"],
    deps = [
        ":compiled_node",
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:model_transformer",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:types",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:any",
        "@com_google_absl//absl/types:variant",
    ],
)
64
tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.cc
Normal file
@ -0,0 +1,64 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"

#include <unordered_set>

#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/rename.h"

namespace tflite {
namespace gpu {
namespace gl {

Status MergeCode(CompiledNodeAttributes* attr,
                 CompiledNodeAttributes* merged_attr) {
  // Build a map of known names.
  std::unordered_set<std::string> known_names;
  for (const auto& parameter : merged_attr->code.parameters) {
    known_names.insert(parameter.name);
  }
  for (const auto& object : merged_attr->code.objects) {
    known_names.insert(object.first);
  }

  // Rewrite parameters with unique names.
  int index =
      merged_attr->code.parameters.size() + merged_attr->code.objects.size();
  RETURN_IF_ERROR(Rename(
      [&](absl::string_view name) -> std::string {
        std::string n(name.begin(), name.end());
        // If a name is unique, keep it as is. Otherwise, append a unique
        // index.
        if (known_names.find(n) == known_names.end()) {
          return n;
        }
        return absl::StrCat(n, index++);
      },
      &attr->code));
  std::move(attr->code.objects.begin(), attr->code.objects.end(),
            std::back_inserter(merged_attr->code.objects));
  std::move(attr->code.parameters.begin(), attr->code.parameters.end(),
            std::back_inserter(merged_attr->code.parameters));
  std::move(attr->node_indices.begin(), attr->node_indices.end(),
            std::back_inserter(merged_attr->node_indices));
  return OkStatus();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
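Editorial example (not part of the original commit) of the renaming rule in MergeCode above; the names are made up purely for illustration:

// If merged_attr already holds parameters {"alpha", "beta"} and one object
// {"weights"}, the rename index starts at 3. An incoming attr whose source
// uses "alpha" and "gamma" is rewritten so that the colliding "alpha" becomes
// "alpha3" (the index then advances to 4 for the next collision), while the
// unique "gamma" is kept as is. The renamed objects, parameters and node
// indices are then moved into merged_attr.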
52
tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h
Normal file
@ -0,0 +1,52 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_COMPILED_NODE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_COMPILED_NODE_H_

#include <vector>

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/object.h"

namespace tflite {
namespace gpu {
namespace gl {

// Contains compiler internal attributes for each node after it was processed
// by NodeShader.
struct CompiledNodeAttributes {
  std::vector<Object> inputs;
  std::vector<Object> outputs;

  GeneratedCode code;

  // Nodes that are covered by the provided shader.
  std::vector<NodeId> node_indices;
};

// Moves all code objects, parameters and node indices from attr to merged_attr.
// Parameters and objects in attr.code.source_code are renamed to ensure
// uniqueness.
Status MergeCode(CompiledNodeAttributes* attr,
                 CompiledNodeAttributes* merged_attr);

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_COMPILED_NODE_H_
228
tensorflow/lite/delegates/gpu/gl/compiler/fuse_auto_input.cc
Normal file
@ -0,0 +1,228 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_auto_input.h"

#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"
#include "absl/types/any.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

std::pair<std::string, std::string> MakeValueReplacement(int n, int k) {
  return {absl::StrCat("value_", n), absl::StrCat("value_", k)};
}

std::pair<std::string, std::string> MakeDataReplacement(int n, int k) {
  return {absl::StrCat("input_data_", n), absl::StrCat("input_data_", k)};
}

}  // namespace

TransformResult FuseAutoInput::ApplyToNode(Node* node, GraphFloat32* graph) {
  auto& node_attr =
      absl::any_cast<CompiledNodeAttributes&>(node->operation.attributes);
  auto& node_code = node_attr.code;

  if (node_code.input != IOStructure::AUTO) {
    return {TransformStatus::SKIPPED, ""};
  }
  uint3 workgroup = node_code.workgroup;

  auto node_outputs = graph->FindOutputs(node->id);

  // Check which inputs could be fused into the current node.
  std::vector<std::pair<Node*, int>> nodes_to_fuse;
  std::vector<std::pair<ValueId, int>> input_values;
  int input_num = -1;
  for (auto input_value : graph->FindInputs(node->id)) {
    input_num++;
    const ValueId input_id = input_value->id;
    input_values.push_back({input_id, input_num});

    if (graph->FindConsumers(input_id).size() > 1) {
      continue;  // input is consumed by >1 nodes
    }
    Node* input_producer = graph->FindProducer(input_id);
    if (input_producer == nullptr) {
      continue;  // graph's input
    }
    if (graph->FindOutputs(input_producer->id).size() != 1) {
      continue;  // input node has more than one output
    }
    auto& input_producer_attr = absl::any_cast<const CompiledNodeAttributes&>(
        input_producer->operation.attributes);
    if (input_producer_attr.code.output != IOStructure::AUTO) {
      continue;
    }
    if (input_producer_attr.code.workload != node_code.workload &&
        uint3() != input_producer_attr.code.workload) {
      continue;
    }
    if (input_producer_attr.code.workgroup != uint3()) {
      // The new fused node should fuse only a single shader that has a
      // pre-defined workgroup. Such a shader is considered "heavy". Do not
      // fuse two heavy shaders into one.
      // TODO(eignasheva): make sure it still works.
      if (workgroup != uint3()) {
        continue;
      }
      workgroup = input_producer_attr.code.workgroup;
    }
    nodes_to_fuse.push_back({input_producer, input_num});
    input_values.pop_back();  // this value will not be used as input.
  }
  if (nodes_to_fuse.empty()) {
    return {TransformStatus::SKIPPED, ""};
  }

  // Break connections between current node and its inputs.
  for (auto value : graph->FindInputs(node->id)) {
    if (!graph->RemoveConsumer(node->id, value->id).ok()) {
      return {TransformStatus::INVALID, ""};
    }
  }

  std::string operation_type;
  std::string source_code;
  std::string values;

  // Node source code needs to be appended later to the end.
  std::swap(source_code, node_code.source_code);

  // Indicates value_k that is beyond the originally declared [0..n] values,
  // therefore it can be used by newly added dependencies.
  int extra_input_num = input_num;
  input_num = 0;

  // Fuse all nodes into one.
  for (auto input_and_num : nodes_to_fuse) {
    auto& input = input_and_num.first;
    auto& attr =
        absl::any_cast<CompiledNodeAttributes&>(input->operation.attributes);
    auto super_inputs = graph->FindInputs(input->id);

    // Replace all internal references in the input source code. For example:
    // source code "value_0 = max(0, value_0);" will be rewritten into
    // "value_2 = max(0, value_2);"
    std::vector<std::pair<std::string, std::string>> replacements;
    for (int i = 0; i < super_inputs.size(); ++i) {
      // Node source code uses value_N to access output value from the fused
      // node. Use correct reference.
      //
      // Here value_N does not correspond to input_N anymore. Instead it tracks
      // value_n and input_m independently. value_index uses an index needed
      // for the "final" shader, while input_num preserves the order of inputs.
      // For example:
      //   Shader A: input_0, input_1
      //     value_0 = value_0 > value_1 ? value_0 : value_1;
      //
      //   Shader B: input_0
      //     value_0 = max(0, value_0);
      //
      //   AddShader: input_0, input_1
      //     value_0 = value_0 + value_1;
      //
      // The fused shader is going to have 3 inputs: input_0 (A), input_1 (A),
      // input_2 (B). But Shader B needs to store its result in value_1,
      // because AddShader refers to it as 'value_1'. So, the fused shader
      // will look as follows:
      //
      //   // Shader A
      //   vec4 value_0 = input_data_0.data[gid.x, gid.y, gid.z];
      //   vec4 value_2 = input_data_1.data[gid.x, gid.y, gid.z];
      //   value_0 = value_0 > value_2 ? value_0 : value_2;
      //
      //   // Shader B
      //   vec4 value_1 = input_data_2.data[gid.x, gid.y, gid.z];
      //   value_1 = max(0, value_1);
      //
      //   // AddShader
      //   value_0 = value_0 + value_1;
      //
      //   output_data_0.data[gid.x, gid.y, gid.z] = value_0;
      int value_index = i == 0 ? input_and_num.second : ++extra_input_num;
      replacements.push_back(MakeValueReplacement(i, value_index));
      replacements.push_back(MakeDataReplacement(i, input_num));

      // Declare input values based on the input structure of the merged node.
      // This code copies what shader_codegen would do automatically.
      if (attr.code.input == IOStructure::AUTO) {
        absl::StrAppend(&values, " value_", value_index, " = $input_data_",
                        input_num, "[gid.x, gid.y, gid.z]$;\n");
      }

      if (!graph->AddConsumer(node->id, super_inputs[i]->id).ok()) {
        return {TransformStatus::INVALID, ""};
      }
      input_num++;
    }
    attr.code.source_code =
        absl::StrReplaceAll(attr.code.source_code, replacements);

    // Merge all objects, parameters and source code.
    if (!MergeCode(&attr, &node_attr).ok()) {
      return {TransformStatus::INVALID, "Unable to merge the code"};
    }
    absl::StrAppend(&node_attr.code.source_code, "{\n", attr.code.source_code,
                    "\n}");

    if (!operation_type.empty()) {
      operation_type += ",";
    }
    operation_type += input->operation.type;

    if (!graph->DeleteNode(input->id).ok()) {
      return {TransformStatus::INVALID, ""};
    }
  }

  // Add back all inputs that are used directly by the fused node.
  for (int i = 0; i < input_values.size(); i++) {
    if (node_code.input == IOStructure::AUTO) {
      absl::StrAppend(&values, " value_", input_values[i].second,
                      " = $input_data_", input_num,
                      "[gid.x, gid.y, gid.z]$;\n");
    }
    if (!graph->AddConsumer(node->id, input_values[i].first).ok()) {
      return {TransformStatus::INVALID, ""};
    }
    input_num++;
  }

  node_code.input = IOStructure::ONLY_DEFINITIONS;

  absl::StrAppend(&node->operation.type, "(", operation_type, ")");
  node_code.source_code =
      absl::StrCat(values, node_code.source_code, "{//FUSED",
                   node->operation.type, "\n", source_code, "\n}");

  return {TransformStatus::APPLIED, ""};
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
49
tensorflow/lite/delegates/gpu/gl/compiler/fuse_auto_input.h
Normal file
@ -0,0 +1,49 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_AUTO_INPUT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_AUTO_INPUT_H_

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {
namespace gl {

// Fuses nodes that have auto output with an auto input node using the
// following rules.
//
// Source graph:
//   A B C
//   \ | /
//     D
//
// - A, B and C each have a single output marked as AUTO
// - Each output is used only by D
// - D has all inputs marked as AUTO
//
// Result: in the best case a single node that does (A,B,C)+D operations.
//
class FuseAutoInput : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final;
};

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_AUTO_INPUT_H_
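For context, here is a minimal sketch of driving this transformation directly over a graph. It is not part of the commit; the nodes() accessor on GraphFloat32 and the status/message field names on TransformResult are assumptions based on how they are used elsewhere in this change.

// Sketch only (assumed API): run FuseAutoInput over every node in a graph.
FuseAutoInput fuse_auto_input;
for (Node* node : graph->nodes()) {  // nodes() accessor is an assumption.
  TransformResult result = fuse_auto_input.ApplyToNode(node, graph);
  if (result.status == TransformStatus::INVALID) {
    // result.message is assumed to carry the reason for the failure.
  }
}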
78
tensorflow/lite/delegates/gpu/gl/compiler/fuse_inline.cc
Normal file
@ -0,0 +1,78 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inline.h"

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_code.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"

namespace tflite {
namespace gpu {
namespace gl {

TransformResult FuseAutoOutputWithInline::ApplyToNodesSequence(
    const std::vector<Node*>& sequence, GraphFloat32* graph) {
  Node* node1 = sequence.front();
  Node* node2 = sequence.back();
  auto& attr1 =
      absl::any_cast<CompiledNodeAttributes&>(node1->operation.attributes);
  auto& attr2 =
      absl::any_cast<CompiledNodeAttributes&>(node2->operation.attributes);

  if (attr1.code.output != IOStructure::AUTO ||
      graph->FindInputs(node2->id).size() != 1 ||
      graph->FindOutputs(node2->id).size() != 1 ||
      attr2.code.output != IOStructure::AUTO ||
      attr2.code.input != IOStructure::AUTO ||
      (attr1.code.workload != attr2.code.workload &&
       uint3() != attr2.code.workload) ||
      graph->FindOutputs(node1->id).size() !=
          graph->FindInputs(node2->id).size()) {
    return {TransformStatus::SKIPPED, ""};
  }

  // Check if the code was not fused yet, and wrap source code into {}.
  if (node1->operation.type.find('+') == std::string::npos) {
    attr1.code.source_code =
        absl::StrCat("\n{\n", attr1.code.source_code, "\n}\n");
  }
  if (!MergeCode(&attr2, &attr1).ok()) {
    return {TransformStatus::INVALID, "Unable to merge two nodes"};
  }
  absl::StrAppend(&attr1.code.source_code, "{\n", attr2.code.source_code,
                  "\n}");
  node1->operation.type += "+" + node2->operation.type;

  if (!RemoveFollowingNode(graph, node2, node1).ok()) {
    return {TransformStatus::INVALID,
            "Unable to remove node " + std::to_string(node2->id)};
  }
  return {TransformStatus::APPLIED, ""};
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
57
tensorflow/lite/delegates/gpu/gl/compiler/fuse_inline.h
Normal file
@ -0,0 +1,57 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INLINE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INLINE_H_

#include <vector>

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {
namespace gl {

// Fuses every two nodes where the first node has default output and the
// second node is INLINE.
//
// Generates code as follows:
//   1. all uniforms are inlined
//   2. source code is wrapped into {}
// For example:
//   value = clamp(value, 0.0, clip);
//     +
//   value = 1.0 / (1.0 + exp(-1.0 * value));
// will turn into:
//   {
//     value = clamp(value, 0.0, clip);
//   }
//   {
//     value = 1.0 / (1.0 + exp(-1.0 * value));
//   }
class FuseAutoOutputWithInline : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final;
};

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INLINE_H_
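A small string-level illustration of the wrapping described in the comment above; it only mirrors the StrCat/StrAppend calls in fuse_inline.cc and is a sketch, not code from the commit.

// Sketch only: the fused source is each shader's code wrapped into its own {}.
std::string first = "value = clamp(value, 0.0, clip);";
std::string second = "value = 1.0 / (1.0 + exp(-1.0 * value));";
std::string fused =
    absl::StrCat("\n{\n", first, "\n}\n", "{\n", second, "\n}");
// `fused` now contains both statements in separate blocks, as shown above.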
151
tensorflow/lite/delegates/gpu/gl/compiler/fuse_inplace.cc
Normal file
@ -0,0 +1,151 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inplace.h"

#include <cstring>
#include <string>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/string_view.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

static const char* kInplacePrefix = "inplace_update:\0";

class EmptyInplaceRewrite : public InlineRewrite {
 public:
  RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
    if (input.compare(0, strlen(kInplacePrefix), kInplacePrefix) == 0) {
      num_rewrites_++;
      return RewriteStatus::SUCCESS;
    }
    return RewriteStatus::NOT_RECOGNIZED;
  }

  int num_rewrites() const { return num_rewrites_; }

 private:
  int num_rewrites_ = 0;
};

// Takes code as input. Replaces 'value_0' in the code with the value that
// comes in a rewrite. For example:
//   code:    value_0 = max(value_0, 0);
//   rewrite: inplace_update:result_12 -> result_12 = max(result_12, 0);
//
class InplaceCodeRewrite : public InlineRewrite {
 public:
  explicit InplaceCodeRewrite(const std::string& code) : code_(code) {}

  RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
    int len = strlen(kInplacePrefix);
    if (input.compare(0, len, kInplacePrefix) == 0) {
      auto variable_name = input.substr(len);
      absl::StrAppend(output,
                      absl::StrReplaceAll(code_, {{"value_0", variable_name}}));
      return RewriteStatus::SUCCESS;
    }
    return RewriteStatus::NOT_RECOGNIZED;
  }

 private:
  std::string code_;
};

}  // namespace

TransformResult RemoveUnusedInplaceUpdates::ApplyToNode(Node* node,
                                                        GraphFloat32* graph) {
  auto& attr =
      absl::any_cast<CompiledNodeAttributes&>(node->operation.attributes);
  // Remove inplace block by rewriting to empty string.
  EmptyInplaceRewrite rewrite;
  TextPreprocessor preprocessor('$', true);
  preprocessor.AddRewrite(&rewrite);
  if (!preprocessor.Rewrite(attr.code.source_code, &attr.code.source_code)
           .ok()) {
    return {TransformStatus::INVALID, ""};
  }
  return {rewrite.num_rewrites() > 0 ? TransformStatus::APPLIED
                                     : TransformStatus::SKIPPED,
          ""};
}

TransformResult FuseInplaceUpdate::ApplyToNodesSequence(
    const std::vector<Node*>& sequence, GraphFloat32* graph) {
  Node* node1 = sequence.front();
  Node* node2 = sequence.back();
  auto& attr1 =
      absl::any_cast<CompiledNodeAttributes&>(node1->operation.attributes);
  auto& attr2 =
      absl::any_cast<CompiledNodeAttributes&>(node2->operation.attributes);

  if (graph->FindInputs(node2->id).size() != 1 ||
      graph->FindOutputs(node2->id).size() != 1 ||
      attr2.code.output != IOStructure::AUTO ||
      attr2.code.input != IOStructure::AUTO ||
      (attr1.code.workload != attr2.code.workload &&
       uint3() != attr2.code.workload)) {
    return {TransformStatus::SKIPPED, ""};
  }

  // First count the replaces that would happen to check whether a rewrite is
  // needed.
  {
    EmptyInplaceRewrite counting_rewrite;
    TextPreprocessor preprocessor('$', true);
    preprocessor.AddRewrite(&counting_rewrite);
    std::string temp;
    if (!preprocessor.Rewrite(attr1.code.source_code, &temp).ok()) {
      return {TransformStatus::INVALID, ""};
    }
    // No rewrites in the source code. Skip it.
    if (counting_rewrite.num_rewrites() == 0) {
      return {TransformStatus::SKIPPED, ""};
    }
  }
  if (!MergeCode(&attr2, &attr1).ok()) {
    return {TransformStatus::INVALID, "Unable to merge two nodes"};
  }
  TextPreprocessor preprocessor('$', true);
  InplaceCodeRewrite rewrite(attr2.code.source_code);
  preprocessor.AddRewrite(&rewrite);
  if (!preprocessor.Rewrite(attr1.code.source_code, &attr1.code.source_code)
           .ok()) {
    return {TransformStatus::INVALID, ""};
  }
  node1->operation.type += "+" + node2->operation.type;

  if (!RemoveFollowingNode(graph, node2, node1).ok()) {
    return {TransformStatus::INVALID,
            "Unable to remove node " + std::to_string(node2->id)};
  }
  return {TransformStatus::APPLIED, ""};
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
67
tensorflow/lite/delegates/gpu/gl/compiler/fuse_inplace.h
Normal file
@ -0,0 +1,67 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INPLACE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INPLACE_H_

#include <vector>

#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"

namespace tflite {
namespace gpu {
namespace gl {

// Fuses two shaders where the second shader is inlined into the first.
// The first shader should have a special symbol that defines the place where
// such fusion should be made and which variable needs to be changed.
// The second shader needs to operate on the 'value_0' variable.
// Example:
//
//   First shader:
//     vec4 result = input_data_0.data[gid.x, gid.y, gid.z];
//     $inplace_update:result$
//     ...
//     output_data_0.data[1,2,3] = result;
//
//   Second shader:
//     value_0 = max(value_0, 0);
//
//   Fused shader:
//     vec4 result = input_data_0.data[gid.x, gid.y, gid.z];
//     result = max(result, 0);
//     ...
//     output_data_0.data[1,2,3] = result;
//
class FuseInplaceUpdate : public SequenceTransformation {
 public:
  int ExpectedSequenceLength() const final { return 2; }

  TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
                                       GraphFloat32* graph) final;
};

// Removes all %inplace_update:XXX% strings from the code.
class RemoveUnusedInplaceUpdates : public NodeTransformation {
 public:
  TransformResult ApplyToNode(Node* node, GraphFloat32* graph) final;
};

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_FUSE_INPLACE_H_
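A string-level sketch of the marker contract described above; the replacement mirrors the absl::StrReplaceAll call in fuse_inplace.cc and is illustrative only, not code from the commit.

// Sketch only: the $inplace_update:result$ marker is replaced by the second
// shader's code with 'value_0' renamed to the marker's variable.
std::string second_shader = "value_0 = max(value_0, 0);";
std::string inlined =
    absl::StrReplaceAll(second_shader, {{"value_0", "result"}});
// `inlined` == "result = max(result, 0);", which is substituted for the marker.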
546
tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.cc
Normal file
@ -0,0 +1,546 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h"

#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace object_accessor_internal {

// Splits name[index1, index2...] into 'name' and {'index1', 'index2'...}.
IndexedElement ParseElement(absl::string_view input) {
  auto i = input.find('[');
  if (i == std::string::npos || input.back() != ']') {
    return {};
  }
  return {input.substr(0, i),
          absl::StrSplit(input.substr(i + 1, input.size() - i - 2), ',',
                         absl::SkipWhitespace())};
}

}  // namespace object_accessor_internal

namespace {

void MaybeConvertToHalf(DataType data_type, absl::string_view value,
                        std::string* output) {
  if (data_type == DataType::FLOAT16) {
    absl::StrAppend(output, "Vec4ToHalf(", value, ")");
  } else {
    absl::StrAppend(output, value);
  }
}

void MaybeConvertFromHalf(DataType data_type, absl::string_view value,
                          std::string* output) {
  if (data_type == DataType::FLOAT16) {
    absl::StrAppend(output, "Vec4FromHalf(", value, ")");
  } else {
    absl::StrAppend(output, value);
  }
}

struct ReadFromTextureGenerator {
  RewriteStatus operator()(uint32_t) const {
    if (element.indices.size() != 1) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    // 1D textures are emulated as 2D textures
    absl::StrAppend(result, "imageLoad(", element.object_name, ", ivec2(",
                    element.indices[0], ", 0))");
    return RewriteStatus::SUCCESS;
  }

  template <typename Shape>
  RewriteStatus operator()(const Shape&) const {
    if (element.indices.size() != Shape::size()) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    absl::StrAppend(result, "imageLoad(", element.object_name, ", ivec",
                    Shape::size(), "(", absl::StrJoin(element.indices, ", "),
                    "))");
    return RewriteStatus::SUCCESS;
  }

  const object_accessor_internal::IndexedElement& element;
  std::string* result;
};

struct ReadFromBufferGenerator {
  RewriteStatus operator()(uint32_t) const {
    if (element.indices.size() != 1) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    MaybeConvertFromHalf(
        data_type,
        absl::StrCat(element.object_name, ".data[", element.indices[0], "]"),
        result);
    return RewriteStatus::SUCCESS;
  }

  RewriteStatus operator()(const uint2& size) const {
    if (element.indices.size() == 1) {
      // Access by linear index. Use the method above to generate the accessor.
      return (*this)(1U);
    }
    if (element.indices.size() != 2) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    MaybeConvertFromHalf(
        data_type,
        absl::StrCat(element.object_name, ".data[", element.indices[0], " + $",
                     element.object_name, "_w$ * (", element.indices[1], ")]"),
        result);
    *requires_sizes = true;
    return RewriteStatus::SUCCESS;
  }

  RewriteStatus operator()(const uint3& size) const {
    if (element.indices.size() == 1) {
      // Access by linear index. Use the method above to generate the accessor.
      return (*this)(1U);
    }
    if (element.indices.size() != 3) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    MaybeConvertFromHalf(
        data_type,
        absl::StrCat(element.object_name, ".data[", element.indices[0], " + $",
                     element.object_name, "_w$ * (", element.indices[1], " + $",
                     element.object_name, "_h$ * (", element.indices[2], "))]"),
        result);
    *requires_sizes = true;
    return RewriteStatus::SUCCESS;
  }

  DataType data_type;
  const object_accessor_internal::IndexedElement& element;
  std::string* result;

  // Indicates that the generated code accessed _w and/or _h size variables.
  bool* requires_sizes;
};

// Generates code for reading an element from an object.
RewriteStatus GenerateReadAccessor(
    const Object& object,
    const object_accessor_internal::IndexedElement& element,
    std::string* result, bool* requires_sizes) {
  switch (object.object_type) {
    case ObjectType::BUFFER:
      return absl::visit(ReadFromBufferGenerator{object.data_type, element,
                                                 result, requires_sizes},
                         object.size);
    case ObjectType::TEXTURE:
      return absl::visit(ReadFromTextureGenerator{element, result},
                         object.size);
    case ObjectType::UNKNOWN:
      return RewriteStatus::ERROR;
  }
}

struct WriteToBufferGenerator {
  RewriteStatus operator()(uint32_t) const {
    if (element.indices.size() != 1) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    absl::StrAppend(result, element.object_name, ".data[", element.indices[0],
                    "] = ");
    MaybeConvertToHalf(data_type, value, result);
    return RewriteStatus::SUCCESS;
  }

  RewriteStatus operator()(const uint2& size) const {
    if (element.indices.size() == 1) {
      // Access by linear index. Use the method above to generate the accessor.
      return (*this)(1U);
    }
    if (element.indices.size() != 2) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    absl::StrAppend(result, element.object_name, ".data[", element.indices[0],
                    " + $", element.object_name, "_w$ * (", element.indices[1],
                    ")] = ");
    MaybeConvertToHalf(data_type, value, result);
    *requires_sizes = true;
    return RewriteStatus::SUCCESS;
  }

  RewriteStatus operator()(const uint3& size) const {
    if (element.indices.size() == 1) {
      // Access by linear index. Use the method above to generate the accessor.
      return (*this)(1U);
    }
    if (element.indices.size() != 3) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    absl::StrAppend(result, element.object_name, ".data[", element.indices[0],
                    " + $", element.object_name, "_w$ * (", element.indices[1],
                    " + $", element.object_name, "_h$ * (", element.indices[2],
                    "))] = ");
    MaybeConvertToHalf(data_type, value, result);
    *requires_sizes = true;
    return RewriteStatus::SUCCESS;
  }

  DataType data_type;
  const object_accessor_internal::IndexedElement& element;
  absl::string_view value;
  std::string* result;

  // Indicates that the generated code accessed _w and/or _h size variables.
  bool* requires_sizes;
};

struct WriteToTextureGenerator {
  RewriteStatus operator()(uint32_t) const {
    if (element.indices.size() != 1) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    // 1D textures are emulated as 2D textures
    absl::StrAppend(result, "imageStore(", element.object_name, ", ivec2(",
                    element.indices[0], ", 0), ", value, ")");
    return RewriteStatus::SUCCESS;
  }

  template <typename Shape>
  RewriteStatus operator()(const Shape&) const {
    if (element.indices.size() != Shape::size()) {
      result->append("WRONG_NUMBER_OF_INDICES");
      return RewriteStatus::ERROR;
    }
    absl::StrAppend(result, "imageStore(", element.object_name, ", ivec",
                    Shape::size(), "(", absl::StrJoin(element.indices, ", "),
                    "), ", value, ")");
    return RewriteStatus::SUCCESS;
  }

  const object_accessor_internal::IndexedElement& element;
  absl::string_view value;
  std::string* result;
};

// Generates code for writing a value to an element in an object.
RewriteStatus GenerateWriteAccessor(
    const Object& object,
    const object_accessor_internal::IndexedElement& element,
    absl::string_view value, std::string* result, bool* requires_sizes) {
  switch (object.object_type) {
    case ObjectType::BUFFER:
      return absl::visit(WriteToBufferGenerator{object.data_type, element,
                                                value, result, requires_sizes},
                         object.size);
    case ObjectType::TEXTURE:
      return absl::visit(WriteToTextureGenerator{element, value, result},
                         object.size);
    case ObjectType::UNKNOWN:
      return RewriteStatus::ERROR;
  }
}

std::string ToAccessModifier(AccessType access, bool use_readonly_modifier) {
  switch (access) {
    case AccessType::READ:
      return use_readonly_modifier ? " readonly" : "";
    case AccessType::WRITE:
      return " writeonly";
    case AccessType::READ_WRITE:
      return " restrict";
  }
  return " unknown_access";
}

std::string ToBufferType(DataType data_type) {
  switch (data_type) {
    case DataType::UINT8:
    case DataType::UINT16:
    case DataType::UINT32:
      return "uvec4";
    case DataType::INT8:
    case DataType::INT16:
    case DataType::INT32:
      return "ivec4";
    case DataType::FLOAT16:
      return "uvec2";
    case DataType::FLOAT32:
      return "vec4";
    default:
      return "unknown";
  }
}

struct TextureImageTypeGetter {
  std::string operator()(uint32_t) const {
    // 1D textures are emulated as 2D textures
    return (*this)(uint2());
  }

  std::string operator()(const uint2&) const {
    switch (type) {
      case DataType::UINT16:
      case DataType::UINT32:
        return "uimage2D";
      case DataType::INT16:
      case DataType::INT32:
        return "iimage2D";
      case DataType::FLOAT16:
      case DataType::FLOAT32:
        return "image2D";
      default:
        return "unknown";
    }
  }

  std::string operator()(const uint3&) const {
    switch (type) {
      case DataType::UINT16:
      case DataType::UINT32:
        return "uimage2DArray";
      case DataType::INT16:
      case DataType::INT32:
        return "iimage2DArray";
      case DataType::FLOAT16:
      case DataType::FLOAT32:
        return "image2DArray";
      default:
        return "unknown";
    }
  }

  DataType type;
};

std::string ToImageType(const Object& object) {
  return absl::visit(TextureImageTypeGetter{object.data_type}, object.size);
}

std::string ToImageLayoutQualifier(DataType type) {
  switch (type) {
    case DataType::UINT16:
      return "rgba16ui";
    case DataType::UINT32:
      return "rgba32ui";
    case DataType::INT16:
      return "rgba16i";
    case DataType::INT32:
      return "rgba32i";
    case DataType::FLOAT16:
      return "rgba16f";
    case DataType::FLOAT32:
      return "rgba32f";
    default:
      return "unknown";
  }
}

std::string ToImagePrecision(DataType type) {
  switch (type) {
    case DataType::UINT16:
    case DataType::INT16:
    case DataType::FLOAT16:
      return "mediump";
    case DataType::UINT32:
    case DataType::INT32:
    case DataType::FLOAT32:
      return "highp";
    default:
      return "unknown";
  }
}

struct SizeParametersAdder {
  void operator()(uint32_t) const {}

  void operator()(const uint2& size) const {
    parameters->AddParameter(
        {absl::StrCat(object_name, "_w"), static_cast<int32_t>(size.x)});
  }

  // p1 and p2 are padding. For some reason buffer does not map correctly
  // without it.
  void operator()(const uint3& size) const {
    parameters->AddParameter(
        {absl::StrCat(object_name, "_w"), static_cast<int32_t>(size.x)});
    parameters->AddParameter(
        {absl::StrCat(object_name, "_h"), static_cast<int32_t>(size.y)});
  }

  absl::string_view object_name;
  ParameterAccessor* parameters;
};

// Adds the parameters to the parameter accessor that represent the object size
// needed for indexed access.
//   - 1D : empty
//   - 2D : 'int object_name_w'
//   - 3D : 'int object_name_w' + 'int object_name_h'
void AddSizeParameters(absl::string_view object_name, const Object& object,
                       ParameterAccessor* parameters) {
  absl::visit(SizeParametersAdder{object_name, parameters}, object.size);
}

void GenerateObjectDeclaration(absl::string_view name, const Object& object,
                               std::string* declaration, bool is_mali) {
  switch (object.object_type) {
    case ObjectType::BUFFER:
      // readonly modifier used to fix shader compilation for Mali on
      // Android 8, see b/111601761
      absl::StrAppend(declaration, "layout(binding = ", object.binding, ")",
                      ToAccessModifier(object.access, !is_mali), " buffer B",
                      object.binding, " { ", ToBufferType(object.data_type),
                      " data[]; } ", name, ";\n");
      break;
    case ObjectType::TEXTURE:
      absl::StrAppend(declaration, "layout(",
                      ToImageLayoutQualifier(object.data_type),
                      ", binding = ", object.binding, ")",
                      ToAccessModifier(object.access, true), " uniform ",
                      ToImagePrecision(object.data_type), " ",
                      ToImageType(object), " ", name, ";\n");
      break;
    case ObjectType::UNKNOWN:
      // do nothing.
      break;
  }
}

}  // namespace

RewriteStatus ObjectAccessor::Rewrite(absl::string_view input,
                                      std::string* output) {
  // Splits 'a =b' into {'a','b'}.
  std::pair<absl::string_view, absl::string_view> n =
      absl::StrSplit(input, absl::MaxSplits('=', 1), absl::SkipWhitespace());
  if (n.first.empty()) {
    return RewriteStatus::NOT_RECOGNIZED;
  }
  if (n.second.empty()) {
    return RewriteRead(absl::StripAsciiWhitespace(n.first), output);
  }
  return RewriteWrite(absl::StripAsciiWhitespace(n.first),
                      absl::StripAsciiWhitespace(n.second), output);
}

RewriteStatus ObjectAccessor::RewriteRead(absl::string_view location,
                                          std::string* output) {
  auto element = object_accessor_internal::ParseElement(location);
  if (element.object_name.empty()) {
    return RewriteStatus::NOT_RECOGNIZED;
  }
  auto it = name_to_object_.find(
      std::string(element.object_name.data(), element.object_name.size()));
  if (it == name_to_object_.end()) {
    return RewriteStatus::NOT_RECOGNIZED;
  }
  bool requires_sizes = false;
  auto status =
      GenerateReadAccessor(it->second, element, output, &requires_sizes);
  if (requires_sizes) {
    AddSizeParameters(it->first, it->second, parameter_accessor_);
  }
  return status;
}

RewriteStatus ObjectAccessor::RewriteWrite(absl::string_view location,
                                           absl::string_view value,
                                           std::string* output) {
  // name[index1, index2...] = value
  auto element = object_accessor_internal::ParseElement(location);
  if (element.object_name.empty()) {
    return RewriteStatus::NOT_RECOGNIZED;
  }
  auto it = name_to_object_.find(
      std::string(element.object_name.data(), element.object_name.size()));
  if (it == name_to_object_.end()) {
    return RewriteStatus::NOT_RECOGNIZED;
  }
  bool requires_sizes = false;
  auto status = GenerateWriteAccessor(it->second, element, value, output,
                                      &requires_sizes);
  if (requires_sizes) {
    AddSizeParameters(it->first, it->second, parameter_accessor_);
  }
  return status;
}

bool ObjectAccessor::AddObject(const std::string& name, Object object) {
  if (object.object_type == ObjectType::UNKNOWN) {
    return false;
  }
  return name_to_object_.insert({name, std::move(object)}).second;
}

std::string ObjectAccessor::GetObjectDeclarations() const {
  std::string declarations;
  for (auto& o : name_to_object_) {
    GenerateObjectDeclaration(o.first, o.second, &declarations, is_mali_);
  }
  return declarations;
}

std::string ObjectAccessor::GetFunctionsDeclarations() const {
  std::string modifier = "";
  // Mali compiler does not want to compile a function without readonly
  // modifier. See b/111601761 for the context.
  if (is_mali_) {
    modifier = "readonly ";
  }
  // If there is at least one SSBO object with F16 data, then we need to output
  // the conversion functions as well.
  for (const auto& o : name_to_object_) {
    if (o.second.data_type == DataType::FLOAT16 &&
        o.second.object_type == ObjectType::BUFFER) {
      return absl::StrCat("vec4 Vec4FromHalf(in ", modifier,
                          "uvec2 v) { return vec4(unpackHalf2x16(v.x), "
                          "unpackHalf2x16(v.y)); }\n"
                          "uvec2 Vec4ToHalf(in ",
                          modifier,
                          "vec4 v) { return uvec2(packHalf2x16(v.xy), "
                          "packHalf2x16(v.zw)); }\n");
    }
  }
  return "";
}

std::vector<Object> ObjectAccessor::GetObjects() const {
  std::vector<Object> objects;
  for (auto& o : name_to_object_) {
    objects.push_back(o.second);
  }
  return objects;
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
105
tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h
Normal file
@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OBJECT_ACCESSOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OBJECT_ACCESSOR_H_

#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
#include "tensorflow/lite/delegates/gpu/gl/object.h"

namespace tflite {
namespace gpu {
namespace gl {

// This rewrite handles access to objects, both reads and writes.
//
// The following syntax is supported to access objects:
//
//   READ:
//     vec4 value = $data[i]$;
//       where data is a buffer or 1D texture
//     vec4 value = $data[i,j]$;
//       where data is a 2D texture
//     vec4 value = $data[i,j,k]$;
//       where data is a 3D texture
//
//   WRITE:
//     $data[i] = value$;
//       where data is a buffer or 1D texture
//     $data[i,j] = value$;
//       where data is a 2D texture
//     $data[i,j,k] = value$;
//       where data is a 3D texture
//
// The accessor supports all types (gvecN) as well as float16.
//
// TODO(akulik): support field in data[x,y,z].x
//
class ObjectAccessor : public InlineRewrite {
 public:
  ObjectAccessor(bool is_mali, ParameterAccessor* parameter_accessor)
      : is_mali_(is_mali), parameter_accessor_(parameter_accessor) {}

  RewriteStatus Rewrite(absl::string_view input, std::string* output) final;

  // Returns true if the object was successfully added.
  bool AddObject(const std::string& name, Object object);

  // Returns object declarations that need to be added to a shader's code.
  std::string GetObjectDeclarations() const;

  // Returns function declarations that need to be added to a shader's code.
  // These functions are used by code accessing objects.
  std::string GetFunctionsDeclarations() const;

  // Returns a collection of registered objects.
  std::vector<Object> GetObjects() const;

 private:
  RewriteStatus RewriteRead(absl::string_view location, std::string* output);

  RewriteStatus RewriteWrite(absl::string_view location,
                             absl::string_view value, std::string* output);

  std::unordered_map<std::string, Object> name_to_object_;

  const bool is_mali_;
  ParameterAccessor* parameter_accessor_;
};

// Implementation details below.

namespace object_accessor_internal {

// Refers to an element in an object.
struct IndexedElement {
  absl::string_view object_name;
  std::vector<absl::string_view> indices;
};

// Splits name[index1, index2...] into 'name' and {'index1', 'index2'...}.
IndexedElement ParseElement(absl::string_view input);

}  // namespace object_accessor_internal
}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OBJECT_ACCESSOR_H_
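A short usage sketch of the accessor on its own; MakeReadonlyBuffer and the expected output follow the tests later in this change, while the meaning of the ParameterAccessor constructor flag is an assumption. The sketch is illustrative, not code from the commit.

// Sketch only: rewrite a single read access through the accessor.
ParameterAccessor parameters(false);  // constructor flag meaning assumed
ObjectAccessor accessor(/*is_mali=*/false, &parameters);
accessor.AddObject("obj", MakeReadonlyBuffer(std::vector<float>{1.0}));
std::string generated;
// "obj[i]" is the text that appears between $...$ markers in shader code.
RewriteStatus status = accessor.Rewrite("obj[i]", &generated);
// On success, `generated` holds "obj.data[i]", matching the tests below.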
@ -0,0 +1,206 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h"

#include <string>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"

namespace tflite {
namespace gpu {
namespace gl {

struct ParameterComparator {
  template <typename T>
  bool operator()(const T& t) const {
    const T* v = absl::get_if<T>(&p.value);
    return v && t == *v;
  }
  const UniformParameter& p;
};

// partially equal
bool operator==(const UniformParameter& l, const UniformParameter& r) {
  return l.name == r.name && absl::visit(ParameterComparator{l}, r.value);
}

namespace {

TEST(Preprocessor, CornerCases) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  std::string result;
  ASSERT_EQ(accessor.Rewrite("", &result), RewriteStatus::NOT_RECOGNIZED);
  ASSERT_EQ(accessor.Rewrite("=", &result), RewriteStatus::NOT_RECOGNIZED);
}

TEST(Preprocessor, ReadFromBuffer) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyBuffer(std::vector<float>{1.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i]", &result), RewriteStatus::SUCCESS);
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "obj.data[i]");
}

TEST(Preprocessor, ReadFromBufferLinear) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyBuffer(uint3(1, 2, 3), std::vector<float>{1.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i]", &result), RewriteStatus::SUCCESS);
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "obj.data[i]");
}

TEST(Preprocessor, ReadFromBufferByIndex) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyBuffer(uint3(1, 2, 3), std::vector<float>{1.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[x,y + 5,z]", &result),
            RewriteStatus::SUCCESS);
  EXPECT_THAT(parameters.GetUniformParameters(),
              testing::UnorderedElementsAre(UniformParameter{"obj_w", 1},
                                            UniformParameter{"obj_h", 2}));
  ASSERT_EQ(result, "obj.data[x + $obj_w$ * (y + 5 + $obj_h$ * (z))]");
}

TEST(Preprocessor, ReadFromTexture) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyTexture(uint3(1, 2, 3), {1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i,j,k]", &result), RewriteStatus::SUCCESS);
  // textures don't need extra variables to be stored for indexed access
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "imageLoad(obj, ivec3(i, j, k))");
}

TEST(Preprocessor, ReadFromTexture1D) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyTexture({1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i]", &result), RewriteStatus::SUCCESS);
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "imageLoad(obj, ivec2(i, 0))");
}

TEST(Preprocessor, WriteToBuffer) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyBuffer(std::vector<float>{1.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite(" obj[i] =value", &result),
            RewriteStatus::SUCCESS);
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "obj.data[i] = value");
}

TEST(Preprocessor, WriteToBufferByIndex) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyBuffer(uint3(1, 2, 3), {1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite(" obj[i,j,k] =value", &result),
            RewriteStatus::SUCCESS);
  EXPECT_THAT(parameters.GetUniformParameters(),
              testing::UnorderedElementsAre(UniformParameter{"obj_w", 1},
                                            UniformParameter{"obj_h", 2}));
  ASSERT_EQ(result, "obj.data[i + $obj_w$ * (j + $obj_h$ * (k))] = value");
}

TEST(Preprocessor, WriteToTexture) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyTexture(uint3(1, 1, 1), {1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i,j,k]= value ", &result),
            RewriteStatus::SUCCESS);
  ASSERT_EQ(result, "imageStore(obj, ivec3(i, j, k), value)");
}

TEST(Preprocessor, WriteToTexture1D) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyTexture({1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i]= value ", &result),
            RewriteStatus::SUCCESS);
  EXPECT_TRUE(parameters.GetUniformParameters().empty());
  ASSERT_EQ(result, "imageStore(obj, ivec2(i, 0), value)");
}

TEST(Preprocessor, FailedWriteToBuffer) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyBuffer(std::vector<float>{1.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite(" obj[i,j] =value", &result),
            RewriteStatus::ERROR);
  ASSERT_EQ(result, "WRONG_NUMBER_OF_INDICES");
}

TEST(Preprocessor, FailedWriteToTexture) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyTexture(uint3(1, 1, 1), {1.0, 2.0, 3.0, 4.0})));
  std::string result;
  EXPECT_EQ(accessor.Rewrite("obj[i]= value ", &result), RewriteStatus::ERROR);
  ASSERT_EQ(result, "WRONG_NUMBER_OF_INDICES");
}

TEST(Preprocessor, DeclareTexture) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(false, &parameters);
  ASSERT_TRUE(accessor.AddObject(
      "obj", MakeReadonlyTexture(uint3(1, 1, 1), {1.0, 2.0, 3.0, 4.0})));
  ASSERT_EQ(accessor.GetObjectDeclarations(),
            "layout(rgba32f, binding = 0) readonly uniform highp image2DArray "
            "obj;\n");
}

TEST(Preprocessor, DeclareBuffer) {
  ParameterAccessor parameters(false);
  ObjectAccessor accessor(true, &parameters);
  ASSERT_TRUE(
      accessor.AddObject("obj", MakeReadonlyBuffer(std::vector<float>{1.0})));
  ASSERT_EQ(accessor.GetObjectDeclarations(),
            "layout(binding = 0) buffer B0 { vec4 data[]; } obj;\n");
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite
368
tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.cc
Normal file
@ -0,0 +1,368 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace parameter_accessor_internal {

// Parse the following regex manually
//   name(\[index\])?(\.field)?
ParameterReference Parse(absl::string_view input) {
  ParameterReference ref;
  auto start_index = input.find('[');
  if (start_index != std::string::npos) {
    auto end_index = input.rfind(']');
    if (end_index == std::string::npos) {
      return ref;
    }
    ref.index = input.substr(start_index + 1, end_index - start_index - 1);
    ref.name = input.substr(0, start_index);
    ref.field = input.substr(end_index + 1);
  } else {
    auto dot = input.find('.');
    if (dot != std::string::npos) {
      ref.name = input.substr(0, dot);
      ref.field = input.substr(dot);
    } else {
      ref.name = input;
    }
  }
  return ref;
}

}  // namespace parameter_accessor_internal

namespace {

struct UniformTypeGetter {
  std::string operator()(int) const { return "int"; }
  std::string operator()(const int2&) const { return "ivec2"; }
  std::string operator()(const std::vector<int2>&) const { return "ivec2"; }
  std::string operator()(const int4&) const { return "ivec4"; }
  std::string operator()(unsigned int) const { return "uint"; }
  std::string operator()(const uint4&) const { return "uvec4"; }
  std::string operator()(float) const { return "float"; }
  std::string operator()(const float2&) const { return "vec2"; }
  std::string operator()(const float4&) const { return "vec4"; }
};

// Returns GLSL uniform type of the given parameter.
std::string GetUniformType(const UniformParameter::ValueType& value) {
  return absl::visit(UniformTypeGetter(), value);
}

template <typename T>
void FormatValue(std::string* result, T t) {
  absl::StrAppend(result, t);
}

template <>
void FormatValue(std::string* result, float t) {
  absl::StrAppend(result, absl::StrFormat("%.9ff", t));
}

// Unfortunately absl::StrJoin with custom formatter requires formatter to use
// string, not std::string. Therefore, due to this compatibility issue data
// needs to be converted to string representation first and then joined.
template <typename T, int N>
std::vector<std::string> ToString(const std::array<T, N>& data) {
  std::vector<std::string> result(N);
  for (int i = 0; i < N; ++i) {
    FormatValue(&result[i], data[i]);
  }
  return result;
}

struct ConstGenerator {
  template <typename T>
  void operator()(T t) const {
    FormatValue(result, t);
  }

  template <typename T>
  void operator()(const Vec2<T>& v) const {
    absl::StrAppend(result, UniformTypeGetter()(v), "(",
                    absl::StrJoin(ToString<T, 2>(v.data_), ","), ")");
  }

  template <typename T>
  void operator()(const Vec3<T>& v) const {
    absl::StrAppend(result, UniformTypeGetter()(v), "(",
                    absl::StrJoin(ToString<T, 3>(v.data_), ","), ")");
  }

  template <typename T>
  void operator()(const Vec4<T>& v) const {
    absl::StrAppend(result, UniformTypeGetter()(v), "(",
                    absl::StrJoin(ToString<T, 4>(v.data_), ","), ")");
  }
|
template <typename T>
|
||||||
|
void operator()(const std::vector<T>& v) const {
|
||||||
|
std::string type = UniformTypeGetter()(v);
|
||||||
|
absl::StrAppend(result, type, "[", v.size(), "](");
|
||||||
|
bool first = true;
|
||||||
|
for (const auto& i : v) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
absl::StrAppend(result, ",");
|
||||||
|
}
|
||||||
|
(*this)(i);
|
||||||
|
}
|
||||||
|
absl::StrAppend(result, ")");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string* result;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Appends string representation of a parameter value.
|
||||||
|
void GetValue(const UniformParameter::ValueType& value, std::string* result) {
|
||||||
|
absl::visit(ConstGenerator{result}, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct UniformDeclarationGenerator {
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const T&) const {
|
||||||
|
absl::StrAppend(result, "uniform ", GetUniformType(param.value), " ",
|
||||||
|
param.name, ";\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const std::vector<T>& v) const {
|
||||||
|
absl::StrAppend(result, "uniform ", GetUniformType(param.value), " ",
|
||||||
|
param.name, "[", v.size(), "];\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
const UniformParameter& param;
|
||||||
|
std::string* result;
|
||||||
|
};
|
||||||
|
|
||||||
|
void GenerateUniformDeclaration(const UniformParameter& parameter,
|
||||||
|
std::string* result) {
|
||||||
|
absl::visit(UniformDeclarationGenerator{parameter, result}, parameter.value);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct VariableLengthGetter {
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const T&) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const std::vector<T>&) const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns true if value is a vector
|
||||||
|
bool IsVariableLength(const UniformParameter::ValueType& value) {
|
||||||
|
return absl::visit(VariableLengthGetter(), value);
|
||||||
|
}
|
||||||
|
|
||||||
|
enum Field : uint8_t { UNKNOWN = 4, X = 0, Y = 1, Z = 2, W = 3 };
|
||||||
|
|
||||||
|
Field ToField(absl::string_view field_name) {
|
||||||
|
if (field_name.size() == 2 && field_name[0] == '.') {
|
||||||
|
switch (field_name[1]) {
|
||||||
|
case 'x':
|
||||||
|
return Field::X;
|
||||||
|
case 'y':
|
||||||
|
return Field::Y;
|
||||||
|
case 'z':
|
||||||
|
return Field::Z;
|
||||||
|
case 'w':
|
||||||
|
return Field::W;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Field::UNKNOWN;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FieldAccessor {
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const T&) const {}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Vec2<T>& v) const {
|
||||||
|
FormatValue(result, v[field]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Vec3<T>& v) const {
|
||||||
|
FormatValue(result, v[field]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Vec4<T>& v) const {
|
||||||
|
FormatValue(result, v[field]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Field field;
|
||||||
|
std::string* result;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Appends formatted value of the given field.
|
||||||
|
void GetValue(const UniformParameter::ValueType& value, Field field,
|
||||||
|
std::string* result) {
|
||||||
|
absl::visit(FieldAccessor{field, result}, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FieldChecker {
|
||||||
|
// For trivial as well as variable-length types indexed access is not allowed.
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const T&) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const Vec2<T>& v) const {
|
||||||
|
return field < v.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const Vec3<T>& v) const {
|
||||||
|
return field < v.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const Vec4<T>& v) const {
|
||||||
|
return field < v.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool operator()(const std::vector<T>&) const {
|
||||||
|
// technically accessing [0] element of an empty vector is UB, but we need
|
||||||
|
// only type information for this check. Therefore, construct default T and
|
||||||
|
// use it instead.
|
||||||
|
T t;
|
||||||
|
return (*this)(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
Field field;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns true if field has field access and field is not out of bounds.
|
||||||
|
bool HasField(const UniformParameter::ValueType& value, Field field) {
|
||||||
|
return absl::visit(FieldChecker{field}, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AssembleAccessor(absl::string_view name, absl::string_view index,
|
||||||
|
absl::string_view field, std::string* result) {
|
||||||
|
if (index.empty()) {
|
||||||
|
absl::StrAppend(result, name, field);
|
||||||
|
} else {
|
||||||
|
absl::StrAppend(result, name, "[", index, "]", field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
RewriteStatus ParameterAccessor::Rewrite(absl::string_view input,
|
||||||
|
std::string* output) {
|
||||||
|
auto ref = parameter_accessor_internal::Parse(input);
|
||||||
|
if (ref.name.empty()) {
|
||||||
|
absl::StrAppend(output, "INVALID_SYNTAX");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto it = name_to_param_.find(std::string(ref.name.data(), ref.name.size()));
|
||||||
|
if (it == name_to_param_.end()) {
|
||||||
|
// Uniform with this name is not registered.
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
const auto& value = it->second.value;
|
||||||
|
|
||||||
|
if (!ref.index.empty() && !IsVariableLength(value)) {
|
||||||
|
// Trying to access parameter by index, but it is not variable-length.
|
||||||
|
absl::StrAppend(output, "INVALID_ACCESS_BY_INDEX");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
Field f = ToField(ref.field);
|
||||||
|
if (!ref.field.empty() && !HasField(value, f)) {
|
||||||
|
// Trying to access a parameter by field, but it does not have it.
|
||||||
|
absl::StrAppend(output, "INVALID_ACCESS_BY_FIELD");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error checks are complete now.
|
||||||
|
|
||||||
|
// All variable-length parameters are encoded as-is without inlining.
|
||||||
|
if (!inline_values_ || IsVariableLength(value)) {
|
||||||
|
AssembleAccessor(it->second.name, ref.index, ref.field, output);
|
||||||
|
} else {
|
||||||
|
// Parameter + field is replaced with field value.
|
||||||
|
if (f != Field::UNKNOWN) {
|
||||||
|
GetValue(value, f, output);
|
||||||
|
} else {
|
||||||
|
// Parameter is accessed directly.
|
||||||
|
GetValue(value, output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ParameterAccessor::AddParameter(UniformParameter param) {
|
||||||
|
std::string name = param.name;
|
||||||
|
return name_to_param_.insert({name, std::move(param)}).second;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ParameterAccessor::GetConstDeclarations() const {
|
||||||
|
// Variable length parameters are declared as const and accessed via variable
|
||||||
|
// with index.
|
||||||
|
std::string declarations;
|
||||||
|
for (auto& param : name_to_param_) {
|
||||||
|
const auto& value = param.second.value;
|
||||||
|
if (IsVariableLength(value)) {
|
||||||
|
absl::StrAppend(&declarations, "const ", GetUniformType(value), " ",
|
||||||
|
param.second.name, "[] = ");
|
||||||
|
GetValue(value, &declarations);
|
||||||
|
absl::StrAppend(&declarations, ";\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return declarations;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ParameterAccessor::GetUniformDeclarations() const {
|
||||||
|
std::string declarations;
|
||||||
|
if (!inline_values_) {
|
||||||
|
for (auto& param : name_to_param_) {
|
||||||
|
GenerateUniformDeclaration(param.second, &declarations);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return declarations;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<UniformParameter> ParameterAccessor::GetUniformParameters() const {
|
||||||
|
std::vector<UniformParameter> params;
|
||||||
|
if (!inline_values_) {
|
||||||
|
for (auto& param : name_to_param_) {
|
||||||
|
params.push_back(param.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
@ -0,0 +1,92 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PARAMETER_ACCESSOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PARAMETER_ACCESSOR_H_

#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
#include "tensorflow/lite/delegates/gpu/gl/uniform_parameter.h"

namespace tflite {
namespace gpu {
namespace gl {

// This rewrite handles access to parameters. It may rewrite a parameter with
// actual values if inline_values is set to true.
//
// The following syntax is supported to access parameters:
//   - simple parameter: name
//   - parameter with field: name.(x|y|z|w)
//   - parameter with index: name[i]
//   - parameter with index and field: name[i].(x|y|z|w)
//
// If 'inline_values' is set to true, non variable-length parameters are
// inlined. For example, 'base.x' is replaced with the value of the 'x' field
// of 'base'. Variable-length parameters are declared as const and accessed
// via index. These declarations are returned by GetConstDeclarations.
//
// If 'inline_values' is set to false, all parameters are declared as
// uniforms. Uniform declarations are returned by GetUniformDeclarations.
class ParameterAccessor : public InlineRewrite {
 public:
  explicit ParameterAccessor(bool inline_values)
      : inline_values_(inline_values) {}

  RewriteStatus Rewrite(absl::string_view input, std::string* output) final;

  // Returns true if the parameter was successfully added.
  bool AddParameter(UniformParameter param);

  // Returns const parameters that need to be inlined in a shader's code.
  std::string GetConstDeclarations() const;

  // Returns uniform declarations that need to be inlined in a shader's code.
  std::string GetUniformDeclarations() const;

  // Returns a collection of uniform parameters.
  std::vector<UniformParameter> GetUniformParameters() const;

 private:
  const bool inline_values_;
  // Unique parameter index used for obfuscation.
  uint32_t unique_param_index_ = 0;

  std::unordered_map<std::string, UniformParameter> name_to_param_;
};

// Implementation details below.

namespace parameter_accessor_internal {

struct ParameterReference {
  absl::string_view name;
  absl::string_view index;
  absl::string_view field;
};

// Parses the reference manually, following the regex:
//   name(\[index\])?(\.field)?
ParameterReference Parse(absl::string_view input);

}  // namespace parameter_accessor_internal
}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PARAMETER_ACCESSOR_H_
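A minimal usage sketch, not part of this commit: it mirrors the AddParameter/Rewrite behavior exercised in parameter_accessor_test.cc further below; the function name is illustrative only.

namespace tflite {
namespace gpu {
namespace gl {

// Sketch: with inline_values=true, parameter references are replaced with
// literal values when the source is rewritten.
inline void ParameterAccessorSketch() {
  ParameterAccessor accessor(/*inline_values=*/true);
  accessor.AddParameter(UniformParameter{"base", int2(1, 2)});

  std::string out;
  accessor.Rewrite("base", &out);    // out becomes "ivec2(1,2)"
  out.clear();
  accessor.Rewrite("base.y", &out);  // out becomes "2"
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite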
@ -0,0 +1,98 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <gmock/gmock.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/types.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
TEST(Preprocessor, CornerCases) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
std::string result;
|
||||||
|
ASSERT_EQ(accessor.Rewrite("unknown", &result),
|
||||||
|
RewriteStatus::NOT_RECOGNIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, Value) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"var", int32_t(1)}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var", &result), RewriteStatus::SUCCESS);
|
||||||
|
ASSERT_EQ(result, "1");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, ValueVec) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"var", int2(1, 2)}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var", &result), RewriteStatus::SUCCESS);
|
||||||
|
ASSERT_EQ(result, "ivec2(1,2)");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, Field) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
ASSERT_TRUE(
|
||||||
|
accessor.AddParameter(UniformParameter{"var", float2(1.0, 2.1234567)}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var.y", &result), RewriteStatus::SUCCESS);
|
||||||
|
ASSERT_EQ(result, "2.123456717f");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, FieldFail) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"var", 1.0f}));
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"vec", float2(1.0, 1.0)}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var.y", &result), RewriteStatus::ERROR);
|
||||||
|
ASSERT_EQ(result, "INVALID_ACCESS_BY_FIELD");
|
||||||
|
|
||||||
|
result.clear();
|
||||||
|
EXPECT_EQ(accessor.Rewrite("vec.z", &result), RewriteStatus::ERROR);
|
||||||
|
ASSERT_EQ(result, "INVALID_ACCESS_BY_FIELD");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, Variable) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
std::vector<int2> v;
|
||||||
|
v.push_back(int2(1, 2));
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"var", v}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var[i].y", &result), RewriteStatus::SUCCESS);
|
||||||
|
ASSERT_EQ(result, "var[i].y");
|
||||||
|
ASSERT_EQ(accessor.GetConstDeclarations(),
|
||||||
|
"const ivec2 var[] = ivec2[1](ivec2(1,2));\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, InlineVariableFail) {
|
||||||
|
ParameterAccessor accessor(true);
|
||||||
|
ASSERT_TRUE(accessor.AddParameter(UniformParameter{"var", 1}));
|
||||||
|
std::string result;
|
||||||
|
EXPECT_EQ(accessor.Rewrite("var[i]", &result), RewriteStatus::ERROR);
|
||||||
|
ASSERT_EQ(result, "INVALID_ACCESS_BY_INDEX");
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
95
tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.cc
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
|
||||||
|
|
||||||
|
#include "absl/strings/str_cat.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Given input string and a delimiter returns back a substring including
|
||||||
|
// delimiters. If there was only starting delimiter found, returns single char.
|
||||||
|
absl::string_view FindInlineBlock(absl::string_view s, char delimiter) {
|
||||||
|
size_t start = s.find(delimiter);
|
||||||
|
if (start != absl::string_view::npos) {
|
||||||
|
size_t end = s.find(delimiter, start + 1);
|
||||||
|
if (end != std::string::npos) {
|
||||||
|
return s.substr(start, end - start + 1);
|
||||||
|
}
|
||||||
|
// Special case to indicate that we didn't find the end.
|
||||||
|
return s.substr(start, 1);
|
||||||
|
}
|
||||||
|
return s.substr(s.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the given 's' and its substring 'subs' returns new substring of 's' that
|
||||||
|
// begins past 'subs'.
|
||||||
|
absl::string_view PastSubstr(absl::string_view s, absl::string_view subs) {
|
||||||
|
return s.substr(subs.data() + subs.size() - s.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
Status TextPreprocessor::Rewrite(const std::string& input,
|
||||||
|
std::string* output) {
|
||||||
|
absl::string_view s = input;
|
||||||
|
std::string result;
|
||||||
|
while (true) {
|
||||||
|
absl::string_view inline_block = FindInlineBlock(s, inline_delimiter_);
|
||||||
|
result.append(s.data(), inline_block.data() - s.data());
|
||||||
|
if (inline_block.empty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (inline_block.size() == 1) {
|
||||||
|
return NotFoundError("Unable to find end of inline block");
|
||||||
|
}
|
||||||
|
s = PastSubstr(s, inline_block);
|
||||||
|
bool processed = false;
|
||||||
|
for (auto& rewrite : inline_rewrites_) {
|
||||||
|
if (processed) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
switch (rewrite->Rewrite(inline_block.substr(1, inline_block.size() - 2),
|
||||||
|
&result)) {
|
||||||
|
case RewriteStatus::NOT_RECOGNIZED:
|
||||||
|
// try another rewrite.
|
||||||
|
break;
|
||||||
|
case RewriteStatus::SUCCESS:
|
||||||
|
processed = true;
|
||||||
|
break;
|
||||||
|
case RewriteStatus::ERROR:
|
||||||
|
return InternalError(absl::StrCat("Error while rewriting '",
|
||||||
|
inline_block, "': ", result));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!processed) {
|
||||||
|
if (!keep_unknown_rewrites_) {
|
||||||
|
return NotFoundError(absl::StrCat("Didn't find inline rewrite for '",
|
||||||
|
inline_block, "'"));
|
||||||
|
}
|
||||||
|
absl::StrAppend(&result, inline_block);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*output = std::move(result);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
74
tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h
Normal file
@ -0,0 +1,74 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PREPROCESSOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PREPROCESSOR_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace gl {

enum class RewriteStatus {
  SUCCESS = 0,
  NOT_RECOGNIZED = 1,
  ERROR = 2,
};

// An inline rewrite matches a string and rewrites it.
class InlineRewrite {
 public:
  virtual ~InlineRewrite() = default;

  virtual RewriteStatus Rewrite(absl::string_view input,
                                std::string* output) = 0;
};

// Text preprocessor runs a collection of registered rewrites.
// A single inline delimiter character must quote every piece of text that is
// to be rewritten.
class TextPreprocessor {
 public:
  // @param keep_unknown_rewrites if true, unhandled rewrites are kept as-is
  // instead of reporting an error.
  TextPreprocessor(char inline_delimiter, bool keep_unknown_rewrites)
      : inline_delimiter_(inline_delimiter),
        keep_unknown_rewrites_(keep_unknown_rewrites) {}

  void AddRewrite(InlineRewrite* rewrite) {
    inline_rewrites_.push_back(rewrite);
  }

  // input and output may point to the same object.
  Status Rewrite(const std::string& input, std::string* output);

 private:
  const char inline_delimiter_;
  const bool keep_unknown_rewrites_;

  std::vector<InlineRewrite*> inline_rewrites_;
};

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_PREPROCESSOR_H_
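A small sketch, not part of this commit, of plugging a custom rewrite into the preprocessor; it mirrors the SingleRewrite pattern used in preprocessor_test.cc below.

namespace tflite {
namespace gpu {
namespace gl {

// Replaces $foo$ with "bla"; every other $...$ block is left untouched
// because keep_unknown_rewrites is true.
class FooRewrite : public InlineRewrite {
 public:
  RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
    if (input == "foo") {
      output->append("bla");
      return RewriteStatus::SUCCESS;
    }
    return RewriteStatus::NOT_RECOGNIZED;
  }
};

inline void PreprocessorSketch() {
  TextPreprocessor preprocessor('$', /*keep_unknown_rewrites=*/true);
  FooRewrite rewrite;
  preprocessor.AddRewrite(&rewrite);
  std::string out;
  // "Hello, $foo$ $name$" becomes "Hello, bla $name$".
  preprocessor.Rewrite("Hello, $foo$ $name$", &out).IgnoreError();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite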
129
tensorflow/lite/delegates/gpu/gl/compiler/preprocessor_test.cc
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <gmock/gmock.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
class AccuInlineRewrite : public InlineRewrite {
|
||||||
|
public:
|
||||||
|
explicit AccuInlineRewrite(std::vector<std::string>* blocks)
|
||||||
|
: blocks_(blocks) {}
|
||||||
|
|
||||||
|
RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
|
||||||
|
blocks_->push_back(std::string(input.data(), input.size()));
|
||||||
|
output->append("r:");
|
||||||
|
output->append(input.data(), input.size());
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string>* blocks_;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<std::string> ParseInlines(const std::string& text) {
|
||||||
|
std::vector<std::string> blocks;
|
||||||
|
TextPreprocessor preprocessor('$', false);
|
||||||
|
AccuInlineRewrite rewrite(&blocks);
|
||||||
|
preprocessor.AddRewrite(&rewrite);
|
||||||
|
std::string discard;
|
||||||
|
preprocessor.Rewrite(text, &discard).IgnoreError();
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, CornerCases) {
|
||||||
|
EXPECT_THAT(ParseInlines(""), testing::ElementsAre());
|
||||||
|
EXPECT_THAT(ParseInlines("text text"), testing::ElementsAre());
|
||||||
|
EXPECT_THAT(ParseInlines("$$"), testing::ElementsAre(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, One) {
|
||||||
|
EXPECT_THAT(ParseInlines("$text$"), testing::ElementsAre("text"));
|
||||||
|
EXPECT_THAT(ParseInlines(" $text$ "), testing::ElementsAre("text"));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, More) {
|
||||||
|
EXPECT_THAT(ParseInlines("Test $inline1$\n$inline2$ test $inline3$ "),
|
||||||
|
testing::ElementsAre("inline1", "inline2", "inline3"));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string RewriteInlines(const std::string& text) {
|
||||||
|
std::vector<std::string> blocks;
|
||||||
|
TextPreprocessor preprocessor('$', false);
|
||||||
|
AccuInlineRewrite rewrite(&blocks);
|
||||||
|
preprocessor.AddRewrite(&rewrite);
|
||||||
|
std::string out;
|
||||||
|
preprocessor.Rewrite(text, &out).IgnoreError();
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, RewriteCornerCases) {
|
||||||
|
EXPECT_EQ(RewriteInlines(""), "");
|
||||||
|
EXPECT_EQ(RewriteInlines("text text"), "text text");
|
||||||
|
EXPECT_EQ(RewriteInlines("$$"), "r:");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, RewriteOne) {
|
||||||
|
EXPECT_EQ(RewriteInlines("$text$"), "r:text");
|
||||||
|
EXPECT_EQ(RewriteInlines(" $text$ "), " r:text ");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, RewriteMore) {
|
||||||
|
EXPECT_EQ(RewriteInlines("Test $inline1$\n$inline2$ test $inline3$ "),
|
||||||
|
"Test r:inline1\nr:inline2 test r:inline3 ");
|
||||||
|
}
|
||||||
|
|
||||||
|
class SingleRewrite : public InlineRewrite {
|
||||||
|
public:
|
||||||
|
RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
|
||||||
|
if (input == "foo") {
|
||||||
|
output->append("bla");
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string>* blocks_;
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST(Preprocessor, KeepUnknownRewrites) {
|
||||||
|
TextPreprocessor preprocessor('$', true);
|
||||||
|
SingleRewrite rewrite;
|
||||||
|
preprocessor.AddRewrite(&rewrite);
|
||||||
|
std::string out;
|
||||||
|
ASSERT_TRUE(preprocessor.Rewrite("Good morning, $name$! $foo$", &out).ok());
|
||||||
|
EXPECT_EQ("Good morning, $name$! bla", out);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Preprocessor, KeepUnknownRewrites_Fail) {
|
||||||
|
TextPreprocessor preprocessor('$', false);
|
||||||
|
SingleRewrite rewrite;
|
||||||
|
preprocessor.AddRewrite(&rewrite);
|
||||||
|
std::string out;
|
||||||
|
EXPECT_FALSE(preprocessor.Rewrite("Good morning, $name$! $foo$", &out).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
203
tensorflow/lite/delegates/gpu/gl/compiler/rename.cc
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/rename.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "absl/strings/str_cat.h"
|
||||||
|
#include "absl/strings/str_join.h"
|
||||||
|
#include "absl/strings/str_split.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/object.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/uniform_parameter.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Rewrites names of all parameters according to returned values from the
|
||||||
|
// given NameFunctor.
|
||||||
|
class ParameterRewriter : public InlineRewrite {
|
||||||
|
public:
|
||||||
|
ParameterRewriter(const std::string& inline_delimiter,
|
||||||
|
const NameFunctor& name_func)
|
||||||
|
: inline_delimiter_(inline_delimiter), name_func_(name_func) {}
|
||||||
|
|
||||||
|
RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
|
||||||
|
auto ref = parameter_accessor_internal::Parse(input);
|
||||||
|
if (ref.name.empty()) {
|
||||||
|
absl::StrAppend(output, "INVALID_SYNTAX");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto it =
|
||||||
|
name_to_param_.find(std::string(ref.name.data(), ref.name.size()));
|
||||||
|
if (it == name_to_param_.end()) {
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reconstruct access using the new name.
|
||||||
|
absl::StrAppend(output, inline_delimiter_, it->second.name);
|
||||||
|
if (!ref.index.empty()) {
|
||||||
|
absl::StrAppend(output, "[", ref.index, "]");
|
||||||
|
}
|
||||||
|
absl::StrAppend(output, ref.field, inline_delimiter_);
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return true if parameter was successfully added.
|
||||||
|
bool AddParameter(UniformParameter param) {
|
||||||
|
std::string old_name = param.name;
|
||||||
|
param.name = name_func_(old_name);
|
||||||
|
return name_to_param_.insert({old_name, std::move(param)}).second;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a collection of uniform parameters with updated names.
|
||||||
|
std::vector<UniformParameter> GetUniformParameters() const {
|
||||||
|
std::vector<UniformParameter> params;
|
||||||
|
params.reserve(name_to_param_.size());
|
||||||
|
for (auto& param : name_to_param_) {
|
||||||
|
params.push_back(param.second);
|
||||||
|
}
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const std::string inline_delimiter_;
|
||||||
|
const NameFunctor name_func_;
|
||||||
|
|
||||||
|
std::unordered_map<std::string, UniformParameter> name_to_param_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Rewrites names of all objects according to returned values from the
|
||||||
|
// given NameFunctor.
|
||||||
|
class ObjectRewriter : public InlineRewrite {
|
||||||
|
public:
|
||||||
|
ObjectRewriter(const std::string& inline_delimiter,
|
||||||
|
const NameFunctor& name_func)
|
||||||
|
: inline_delimiter_(inline_delimiter), name_func_(name_func) {}
|
||||||
|
|
||||||
|
RewriteStatus Rewrite(absl::string_view input, std::string* output) final {
|
||||||
|
// Splits 'a = b' into {'a','b'}.
|
||||||
|
std::pair<absl::string_view, absl::string_view> n =
|
||||||
|
absl::StrSplit(input, absl::MaxSplits('=', 1), absl::SkipWhitespace());
|
||||||
|
if (n.first.empty()) {
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n.second.empty()) {
|
||||||
|
return RewriteRead(absl::StripAsciiWhitespace(n.first), output);
|
||||||
|
}
|
||||||
|
return RewriteWrite(absl::StripAsciiWhitespace(n.first),
|
||||||
|
absl::StripAsciiWhitespace(n.second), output);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return true if an object was successfully added.
|
||||||
|
bool AddObject(const std::string& name, Object object) {
|
||||||
|
std::string new_name = name_func_(name);
|
||||||
|
return name_to_object_.insert({name, {new_name, std::move(object)}}).second;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a collection of registered objects with updated names.
|
||||||
|
std::vector<std::pair<std::string, Object>> GetObjects() const {
|
||||||
|
std::vector<std::pair<std::string, Object>> objects;
|
||||||
|
objects.reserve(name_to_object_.size());
|
||||||
|
for (auto& o : name_to_object_) {
|
||||||
|
objects.push_back(o.second);
|
||||||
|
}
|
||||||
|
return objects;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
RewriteStatus RewriteRead(absl::string_view location, std::string* output) {
|
||||||
|
auto element = object_accessor_internal::ParseElement(location);
|
||||||
|
if (element.object_name.empty()) {
|
||||||
|
absl::StrAppend(output, "UNABLE_TO_PARSE_INDEXED_ELEMENT");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
auto it = name_to_object_.find(
|
||||||
|
std::string(element.object_name.data(), element.object_name.size()));
|
||||||
|
if (it == name_to_object_.end()) {
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
absl::StrAppend(output, inline_delimiter_, it->second.first, "[",
|
||||||
|
absl::StrJoin(element.indices, ","), "]",
|
||||||
|
inline_delimiter_);
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
RewriteStatus RewriteWrite(absl::string_view location,
|
||||||
|
absl::string_view value, std::string* output) {
|
||||||
|
// name[index1, index2...] = value
|
||||||
|
auto element = object_accessor_internal::ParseElement(location);
|
||||||
|
if (element.object_name.empty()) {
|
||||||
|
absl::StrAppend(output, "UNABLE_TO_PARSE_INDEXED_ELEMENT");
|
||||||
|
return RewriteStatus::ERROR;
|
||||||
|
}
|
||||||
|
auto it = name_to_object_.find(
|
||||||
|
std::string(element.object_name.data(), element.object_name.size()));
|
||||||
|
if (it == name_to_object_.end()) {
|
||||||
|
return RewriteStatus::NOT_RECOGNIZED;
|
||||||
|
}
|
||||||
|
absl::StrAppend(output, inline_delimiter_, it->second.first, "[",
|
||||||
|
absl::StrJoin(element.indices, ","), "] = ", value,
|
||||||
|
inline_delimiter_);
|
||||||
|
return RewriteStatus::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string inline_delimiter_;
|
||||||
|
const NameFunctor name_func_;
|
||||||
|
|
||||||
|
std::unordered_map<std::string, std::pair<std::string, Object>>
|
||||||
|
name_to_object_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
Status Rename(const NameFunctor& name_func, GeneratedCode* code) {
|
||||||
|
ParameterRewriter param_rewriter("$", name_func);
|
||||||
|
ObjectRewriter object_rewriter("$", name_func);
|
||||||
|
for (auto&& param : code->parameters) {
|
||||||
|
if (!param_rewriter.AddParameter(std::move(param))) {
|
||||||
|
return InternalError("Parameter name already exists");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto&& object : code->objects) {
|
||||||
|
if (!object_rewriter.AddObject(object.first, std::move(object.second))) {
|
||||||
|
return InternalError("Object name already exists");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TextPreprocessor preprocessor('$', /* keep_unknown_rewrites = */ true);
|
||||||
|
preprocessor.AddRewrite(¶m_rewriter);
|
||||||
|
preprocessor.AddRewrite(&object_rewriter);
|
||||||
|
std::string source_code;
|
||||||
|
RETURN_IF_ERROR(preprocessor.Rewrite(code->source_code, &source_code));
|
||||||
|
code->source_code = source_code;
|
||||||
|
code->parameters = param_rewriter.GetUniformParameters();
|
||||||
|
code->objects = object_rewriter.GetObjects();
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
41
tensorflow/lite/delegates/gpu/gl/compiler/rename.h
Normal file
@ -0,0 +1,41 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_RENAME_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_RENAME_H_

#include <functional>
#include <string>

#include "absl/strings/string_view.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"

namespace tflite {
namespace gpu {
namespace gl {

// Functor that takes the old name and returns a new name.
using NameFunctor = std::function<std::string(absl::string_view name)>;

// Rewrites source code, objects and parameters with the new names supplied
// by the given functor.
Status Rename(const NameFunctor& name_func, GeneratedCode* code);

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_RENAME_H_
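A hedged sketch, not from this commit, of how a caller might rename everything in a piece of generated code; GeneratedCode comes from node_shader.h, and the "node1_" prefix is purely illustrative.

#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/rename.h"

namespace tflite {
namespace gpu {
namespace gl {

// Prefixes every parameter and object name, and every reference to them in
// the shader source, with "node1_".
inline Status PrefixNamesSketch(GeneratedCode* code) {
  return Rename(
      [](absl::string_view name) { return absl::StrCat("node1_", name); },
      code);
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite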
68
tensorflow/lite/delegates/gpu/gl/compiler/shader_code.h
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODE_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODE_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/types.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/object.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/uniform_parameter.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
|
||||||
|
struct ShaderCode {
|
||||||
|
ShaderCode() = default;
|
||||||
|
ShaderCode(const std::vector<UniformParameter>& in_parameters,
|
||||||
|
const std::vector<Object>& in_objects, const uint3& in_workload,
|
||||||
|
const uint3& in_recommended_workgroup,
|
||||||
|
const std::string& in_source_code,
|
||||||
|
const std::vector<NodeId>& in_node_indices)
|
||||||
|
: parameters(in_parameters),
|
||||||
|
objects(in_objects),
|
||||||
|
workload(in_workload),
|
||||||
|
recommended_workgroup(in_recommended_workgroup),
|
||||||
|
source_code(in_source_code),
|
||||||
|
node_indices(in_node_indices) {}
|
||||||
|
|
||||||
|
// A list of uniform parameters to be set.
|
||||||
|
std::vector<UniformParameter> parameters;
|
||||||
|
|
||||||
|
// A list of objects to bind to opengl program.
|
||||||
|
std::vector<Object> objects;
|
||||||
|
|
||||||
|
uint3 workload;
|
||||||
|
|
||||||
|
// operation may specify recommended workgroup size
|
||||||
|
uint3 recommended_workgroup;
|
||||||
|
|
||||||
|
// Generated source code does not set local size, therefore it needs to be set
|
||||||
|
// elsewhere.
|
||||||
|
std::string source_code;
|
||||||
|
|
||||||
|
// nodes of the graph that are covered by the shader.
|
||||||
|
std::vector<NodeId> node_indices;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODE_H_
|
148
tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.cc
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include "absl/strings/str_cat.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
|
||||||
|
ShaderCodegen::ShaderCodegen(const CompilationOptions& options,
|
||||||
|
const GpuInfo& gpu_info)
|
||||||
|
: options_(options), gpu_type_(gpu_info.type) {}
|
||||||
|
|
||||||
|
Status ShaderCodegen::Build(CompiledNodeAttributes attr,
|
||||||
|
ShaderCode* shader_code) const {
|
||||||
|
ParameterAccessor parameters(options_.inline_parameters);
|
||||||
|
ObjectAccessor objects(gpu_type_ == GpuType::MALI, ¶meters);
|
||||||
|
|
||||||
|
auto add_object = [&](const std::string& name, Object&& object) {
|
||||||
|
if (!objects.AddObject(name, std::forward<Object>(object))) {
|
||||||
|
return InternalError("There is an object with the same name");
|
||||||
|
}
|
||||||
|
return OkStatus();
|
||||||
|
};
|
||||||
|
|
||||||
|
auto add_parameter = [&](UniformParameter&& param) {
|
||||||
|
if (!parameters.AddParameter(std::forward<UniformParameter>(param))) {
|
||||||
|
return InternalError("There is a parameter with the same name");
|
||||||
|
}
|
||||||
|
return OkStatus();
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& param : attr.code.parameters) {
|
||||||
|
RETURN_IF_ERROR(add_parameter(std::move(param)));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto&& object : attr.code.objects) {
|
||||||
|
RETURN_IF_ERROR(add_object(object.first, std::move(object.second)));
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
for (auto&& input : attr.inputs) {
|
||||||
|
RETURN_IF_ERROR(
|
||||||
|
add_object(absl::StrCat("input_data_", index++), std::move(input)));
|
||||||
|
}
|
||||||
|
index = 0;
|
||||||
|
for (auto&& output : attr.outputs) {
|
||||||
|
RETURN_IF_ERROR(
|
||||||
|
add_object(absl::StrCat("output_data_", index++), std::move(output)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(akulik): workload params need to go away and be replaced with
|
||||||
|
// output_data_0_w
|
||||||
|
RETURN_IF_ERROR(add_parameter(
|
||||||
|
{"workload_x", static_cast<int32_t>(attr.code.workload.x)}));
|
||||||
|
RETURN_IF_ERROR(add_parameter(
|
||||||
|
{"workload_y", static_cast<int32_t>(attr.code.workload.y)}));
|
||||||
|
RETURN_IF_ERROR(add_parameter(
|
||||||
|
{"workload_z", static_cast<int32_t>(attr.code.workload.z)}));
|
||||||
|
|
||||||
|
std::string source_code = R"(
|
||||||
|
ivec3 gid = ivec3(gl_GlobalInvocationID.xyz);
|
||||||
|
if (gid.x >= $workload_x$ || gid.y >= $workload_y$ || gid.z >= $workload_z$) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
|
||||||
|
switch (attr.code.input) {
|
||||||
|
case IOStructure::ONLY_DEFINITIONS:
|
||||||
|
for (int i = 0; i < attr.inputs.size(); ++i) {
|
||||||
|
absl::StrAppend(&source_code, " highp vec4 value_", i,
|
||||||
|
" = vec4(0);\n");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case IOStructure::AUTO: {
|
||||||
|
for (int i = 0; i < attr.inputs.size(); ++i) {
|
||||||
|
absl::StrAppend(&source_code, " highp vec4 value_", i,
|
||||||
|
" = $input_data_", i, "[gid.x, gid.y, gid.z]$;\n");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
source_code.append(attr.code.source_code);
|
||||||
|
|
||||||
|
if (attr.code.output == IOStructure::AUTO) {
|
||||||
|
for (int i = 0; i < attr.outputs.size(); ++i) {
|
||||||
|
absl::StrAppend(&source_code, " $output_data_", i,
|
||||||
|
"[gid.x, gid.y, gid.z] = value_", i, "$;\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// At this point main function is already generated. Now we need to process
|
||||||
|
// object and parameter accessors.
|
||||||
|
|
||||||
|
// process objects first. Object accessor may introduce new uniform
|
||||||
|
// parameters that need to be rewritten in the subsequent pass.
|
||||||
|
{
|
||||||
|
TextPreprocessor preprocessor('$', /*keep_unknown_rewrites=*/true);
|
||||||
|
preprocessor.AddRewrite(&objects);
|
||||||
|
RETURN_IF_ERROR(preprocessor.Rewrite(source_code, &source_code));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TextPreprocessor preprocessor('$', /*keep_unknown_rewrites=*/false);
|
||||||
|
preprocessor.AddRewrite(¶meters);
|
||||||
|
RETURN_IF_ERROR(preprocessor.Rewrite(source_code, &source_code));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options_.inline_parameters) {
|
||||||
|
source_code = absl::StrCat(parameters.GetConstDeclarations(), source_code);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string declarations = absl::StrCat(
|
||||||
|
objects.GetFunctionsDeclarations(), "\n", objects.GetObjectDeclarations(),
|
||||||
|
"\n", parameters.GetUniformDeclarations());
|
||||||
|
*shader_code = ShaderCode(
|
||||||
|
parameters.GetUniformParameters(), objects.GetObjects(),
|
||||||
|
attr.code.workload, attr.code.workgroup,
|
||||||
|
absl::StrCat("layout(std430) buffer;\nprecision ",
|
||||||
|
(options_.allow_precision_loss ? "mediump" : "highp"),
|
||||||
|
" float;\n", declarations, "\nvoid main() {\n", source_code,
|
||||||
|
"\n}"),
|
||||||
|
attr.node_indices);
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
54
tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.h
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODEGEN_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODEGEN_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/parameter_accessor.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_code.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/compiler_options.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/object.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
|
||||||
|
// This class is responsible for assembling a shader by putting together
|
||||||
|
// objects, parameters declarations and main function.
|
||||||
|
class ShaderCodegen {
|
||||||
|
public:
|
||||||
|
ShaderCodegen(const CompilationOptions& options, const GpuInfo& gpu_info);
|
||||||
|
|
||||||
|
// Builds final program representation.
|
||||||
|
Status Build(CompiledNodeAttributes attr, ShaderCode* shader_code) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
const CompilationOptions options_;
|
||||||
|
const GpuType gpu_type_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_SHADER_CODEGEN_H_
|
68
tensorflow/lite/delegates/gpu/gl/compiler_options.h
Normal file
@ -0,0 +1,68 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OPTIONS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OPTIONS_H_

#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/object.h"

namespace tflite {
namespace gpu {
namespace gl {

// Default constructor for options turns on all optimizations.
struct CompilationOptions {
  // Allows quantizing tensors, downcasting values, processing in float16, etc.
  bool allow_precision_loss = false;

  // When set, multiple operations are fused into a single shader. As a result,
  // there will be fewer shaders, but each shader becomes larger.
  bool fuse_operations = true;

  // Parameters will be inlined into a shader. This in turn generates more
  // unique shaders, each of which needs to be compiled.
  bool inline_parameters = false;

  // If true, shaders that have auto-input and auto-output will use a single
  // object for reading and writing.
  bool inline_objects = true; // TODO(akulik): unsupported

  // Can only be textures or buffers.
  ObjectType preferred_obj_type = ObjectType::UNKNOWN;
  // The user may choose between textures and buffers: textures work better on
  // Adreno, and buffers are better for Mali.

  // Chooses the object type used to represent intermediate tensors. Buffers
  // have more efficient memory usage because they represent an opaque memory
  // blob, but textures work better on Adreno.
  // TODO(akulik): may be better name?
  ObjectType ref_obj_type = ObjectType::UNKNOWN;

  // If true, the user may change the BATCH dimension at runtime. Otherwise,
  // the batch size is fixed at compile time.
  // Dynamic mode uses less memory, while static mode may yield better
  // performance for small models.
  bool dynamic_batch = false;

  // Fuses consecutive nodes which have auto output and auto input.
  bool auto_input_fusion = true;
};

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OPTIONS_H_
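For reference, a hedged sketch of overriding the defaults above from client code; ObjectType::TEXTURE is assumed to be one of the values declared in object.h (only ObjectType::UNKNOWN appears in this header), and the snippet is illustrative rather than taken from this change:

// Inside namespace tflite::gpu::gl; a usage sketch, not part of this commit.
CompilationOptions options;                        // fusion is on by default
options.allow_precision_loss = true;               // allow fp16 math in shaders
options.preferred_obj_type = ObjectType::TEXTURE;  // assumed enum value; textures tend to win on Adreno
options.dynamic_batch = false;                     // batch size fixed at compile time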
102
tensorflow/lite/delegates/gpu/gl/converters/BUILD
Normal file
102
tensorflow/lite/delegates/gpu/gl/converters/BUILD
Normal file
@ -0,0 +1,102 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"]) # Apache 2.0

cc_library(
    name = "util",
    hdrs = ["util.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "bhwc_to_phwc4",
    srcs = ["bhwc_to_phwc4.cc"],
    hdrs = ["bhwc_to_phwc4.h"],
    deps = [
        ":util",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "//tensorflow/lite/delegates/gpu/gl:command_queue",
        "//tensorflow/lite/delegates/gpu/gl:gl_buffer",
        "//tensorflow/lite/delegates/gpu/gl:gl_program",
        "//tensorflow/lite/delegates/gpu/gl:gl_shader",
        "//tensorflow/lite/delegates/gpu/gl:uniform_parameter",
    ],
)

cc_test(
    name = "bhwc_to_phwc4_test",
    size = "small",
    srcs = ["bhwc_to_phwc4_test.cc"],
    linkopts = [
        "-lGLESv3",
        "-lEGL",
    ],
    tags = [
        "local",
        "nobuilder",
        "notap",
    ],
    deps = [
        ":bhwc_to_phwc4",
        "//tensorflow/lite/delegates/gpu/common:convert",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/gl:egl_environment",
        "//tensorflow/lite/delegates/gpu/gl:gl_buffer",
        "//tensorflow/lite/delegates/gpu/gl:portable",
        "@com_google_absl//absl/types:span",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "phwc4_to_bhwc",
    srcs = ["phwc4_to_bhwc.cc"],
    hdrs = ["phwc4_to_bhwc.h"],
    deps = [
        ":util",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "//tensorflow/lite/delegates/gpu/gl:command_queue",
        "//tensorflow/lite/delegates/gpu/gl:gl_buffer",
        "//tensorflow/lite/delegates/gpu/gl:gl_program",
        "//tensorflow/lite/delegates/gpu/gl:gl_shader",
        "//tensorflow/lite/delegates/gpu/gl:uniform_parameter",
    ],
)

cc_test(
    name = "phwc4_to_bhwc_test",
    size = "small",
    srcs = ["phwc4_to_bhwc_test.cc"],
    linkopts = [
        "-lGLESv3",
        "-lEGL",
    ],
    tags = [
        "local",
        "nobuilder",
        "notap",
    ],
    deps = [
        ":phwc4_to_bhwc",
        "//tensorflow/lite/delegates/gpu/common:convert",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/gl:egl_environment",
        "//tensorflow/lite/delegates/gpu/gl:gl_buffer",
        "//tensorflow/lite/delegates/gpu/gl:portable",
        "@com_google_absl//absl/types:span",
        "@com_google_googletest//:gtest_main",
    ],
)
106
tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.cc
Normal file
106
tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.cc
Normal file
@ -0,0 +1,106 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h"

#include <algorithm>
#include <cstdint>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/util.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/uniform_parameter.h"

namespace tflite {
namespace gpu {
namespace gl {

Status ConverterBhwcToPhwc4::Create(ConverterBhwcToPhwc4* converter) {
  uint3 workgroup_size = uint3(4, 4, 4);
  std::string shader_source = GetShaderHeader(workgroup_size) + R"(
    layout(std430) buffer;

    precision highp float;

    layout(binding = 0) readonly buffer B0 {
      float elements[];
    } input_data;

    layout(binding = 1) writeonly buffer B1 {
      vec4 elements[];
    } output_data;

    uniform ivec4 sizes_;

    void main() {
      ivec3 gid = ivec3(gl_GlobalInvocationID.xyz);
      if (gid.x >= sizes_.x || gid.y >= sizes_.y || gid.z >= sizes_.z) {
        return;
      }
      vec4 v = vec4(0);
      int dst_channel = gid.z * 4;
      int index = (gid.y * sizes_.x + gid.x) * sizes_.w + dst_channel;
      for (int i = 0; i < 4; ++i, ++index, ++dst_channel) {
        if (dst_channel >= sizes_.w) break;
        v[i] = input_data.elements[index];
      }
      output_data.elements[(gid.z * sizes_.y + gid.y) * sizes_.x + gid.x] = v;
    })";

  GlShader shader;
  RETURN_IF_ERROR(
      GlShader::CompileShader(GL_COMPUTE_SHADER, shader_source, &shader));
  GlProgram program;
  RETURN_IF_ERROR(GlProgram::CreateWithShader(shader, &program));
  *converter = ConverterBhwcToPhwc4(std::move(program), workgroup_size);
  return OkStatus();
}

Status ConverterBhwcToPhwc4::Convert(const BHWC& shape, const GlBuffer& source,
                                     CommandQueue* command_queue,
                                     GlBuffer* destination) {
  if (source.bytes_size() < BytesForBHWC(shape)) {
    return InvalidArgumentError(
        "BhwcToPhwc4: Input data size does not match expected size.");
  }
  if (destination->bytes_size() < BytesForPHWC4(shape)) {
    return InvalidArgumentError(
        "BhwcToPhwc4: output data size does not match expected size.");
  }
  if (shape.b != 1) {
    return UnimplementedError("BhwcToPhwc4: Batch size is not equal to 1.");
  }
  uint3 workload = uint3(shape.w, shape.h, shape.c);
  uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);

  RETURN_IF_ERROR(program_.SetParameter(UniformParameter{
      "sizes_",
      int4(static_cast<int32_t>(workload.x), static_cast<int32_t>(workload.y),
           static_cast<int32_t>(workload.z), static_cast<int32_t>(shape.c))}));
  RETURN_IF_ERROR(source.BindToIndex(0));
  RETURN_IF_ERROR(destination->BindToIndex(1));
  if (command_queue) {
    return command_queue->Dispatch(program_, num_workgroups);
  }
  return program_.Dispatch(num_workgroups);
}

} // namespace gl
} // namespace gpu
} // namespace tflite
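A worked example of the gather performed by the shader above, for illustration only: with W = sizes_.x = 4, H = sizes_.y = 3 and C = sizes_.w = 6, the invocation at gid = (x = 2, y = 1, z = 1) packs channels 4..7 into one vec4 (channels 6 and 7 exceed C and stay zero):

// dst_channel      = gid.z * 4                   = 4
// BHWC read index  = (y*W + x)*C + dst_channel   = (1*4 + 2)*6 + 4 = 40
// channels packed  = 4 and 5 (the loop breaks once dst_channel >= C)
// PHWC4 vec4 index = (gid.z*H + y)*W + x         = (1*3 + 1)*4 + 2 = 18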
53
tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h
Normal file
53
tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h
Normal file
@ -0,0 +1,53 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_BHWC_TO_PHWC4_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_BHWC_TO_PHWC4_H_

#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"

namespace tflite {
namespace gpu {
namespace gl {

class ConverterBhwcToPhwc4 {
 public:
  // Creates an invalid object.
  ConverterBhwcToPhwc4() : program_(), workgroup_size_() {}

  static Status Create(ConverterBhwcToPhwc4* converter);

  Status Convert(const BHWC& shape, const GlBuffer& source,
                 CommandQueue* command_queue /* optional */,
                 GlBuffer* destination);

 private:
  explicit ConverterBhwcToPhwc4(GlProgram program, const uint3& workgroup_size)
      : program_(std::move(program)), workgroup_size_(workgroup_size) {}

  GlProgram program_;
  uint3 workgroup_size_;
};

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_BHWC_TO_PHWC4_H_
@ -0,0 +1,94 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h"

#include <algorithm>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

inline std::vector<float> GenerateFloats(float multiplier, int size) {
  std::vector<float> v(size);
  for (int i = 0; i < size; ++i) {
    v[i] = multiplier * i * (i % 2 == 0 ? -1 : 1);
  }
  return v;
}

Status RunTest(const BHWC& shape) {
  // Create random input and calculate expected output for it.
  std::vector<float> input = GenerateFloats(0.01, shape.DimensionsProduct());
  std::vector<float> output(GetElementsSizeForPHWC4(shape), 0);
  RETURN_IF_ERROR(
      ConvertToPHWC4(absl::MakeConstSpan(input.data(), input.size()), shape,
                     absl::MakeSpan(output.data(), output.size())));

  std::unique_ptr<EglEnvironment> env;
  RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env));

  // Create input and output buffers
  GlBuffer input_buffer;
  RETURN_IF_ERROR(CreateReadOnlyShaderStorageBuffer(
      absl::MakeConstSpan(input.data(), input.size()), &input_buffer));

  GlBuffer output_buffer;
  RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
      GetElementsSizeForPHWC4(shape), &output_buffer));

  // Create converter and run it.
  ConverterBhwcToPhwc4 converter;
  RETURN_IF_ERROR(ConverterBhwcToPhwc4::Create(&converter));
  RETURN_IF_ERROR(
      converter.Convert(shape, input_buffer, nullptr, &output_buffer));

  std::vector<float> converted_output(output.size(), 0);
  RETURN_IF_ERROR(output_buffer.Read(
      absl::MakeSpan(converted_output.data(), converted_output.size())));
  if (output != converted_output) {
    return InternalError("Outputs don't match");
  }
  return OkStatus();
}

TEST(HwcToPhwc4, Smoke) {
  for (int32_t h : {1, 2, 3, 7, 20}) {
    for (int32_t w : {1, 2, 4, 5, 11}) {
      for (int32_t c : {1, 2, 4, 5, 8, 9}) {
        BHWC shape(1, h, w, c);
        EXPECT_TRUE(RunTest(shape).ok())
            << shape.h << " " << shape.w << " " << shape.c;
      }
    }
  }
}

} // namespace
} // namespace gl
} // namespace gpu
} // namespace tflite
102
tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.cc
Normal file
102
tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.cc
Normal file
@ -0,0 +1,102 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h"

#include <algorithm>
#include <cstdint>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/util.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/uniform_parameter.h"

namespace tflite {
namespace gpu {
namespace gl {

Status ConverterPhwc4ToBhwc::Create(ConverterPhwc4ToBhwc* converter) {
  uint3 workgroup_size = uint3(4, 4, 4);
  std::string shader_source = GetShaderHeader(workgroup_size) + R"(
    layout(std430) buffer;

    precision highp float;

    layout(binding = 0) readonly buffer B0 {
      vec4 elements[];
    } input_data;

    layout(binding = 1) writeonly buffer B1 {
      float elements[];
    } output_data;

    uniform ivec4 sizes_;

    void main() {
      ivec3 gid = ivec3(gl_GlobalInvocationID.xyz);
      if (gid.x >= sizes_.x || gid.y >= sizes_.y || gid.z >= sizes_.z) {
        return;
      }
      output_data.elements[(gid.y * sizes_.x + gid.x) * sizes_.z + gid.z] =
          input_data.elements[(gid.z / 4 * sizes_.y + gid.y) * sizes_.x +
                              gid.x][gid.z % 4];
    })";

  GlShader shader;
  RETURN_IF_ERROR(
      GlShader::CompileShader(GL_COMPUTE_SHADER, shader_source, &shader));
  GlProgram program;
  RETURN_IF_ERROR(GlProgram::CreateWithShader(shader, &program));
  *converter = ConverterPhwc4ToBhwc(std::move(program), workgroup_size);
  return OkStatus();
}

Status ConverterPhwc4ToBhwc::Convert(const BHWC& shape, const GlBuffer& source,
                                     CommandQueue* command_queue,
                                     GlBuffer* destination) {
  if (source.bytes_size() < BytesForPHWC4(shape)) {
    return InvalidArgumentError(
        "Phwc4ToBhwc: Input data size does not match expected size.");
  }
  if (destination->bytes_size() < BytesForBHWC(shape)) {
    return InvalidArgumentError(
        "Phwc4ToBhwc: output data size does not match expected size.");
  }
  if (shape.b != 1) {
    return UnimplementedError("Phwc4ToBhwc: Batch size is not equal to 1.");
  }

  uint3 workload = uint3(shape.w, shape.h, shape.c);
  uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);

  // TODO(akulik): simply pass workload as soon as UniformParameter
  // supports uint3
  RETURN_IF_ERROR(program_.SetParameter(UniformParameter{
      "sizes_",
      int4(static_cast<int32_t>(workload.x), static_cast<int32_t>(workload.y),
           static_cast<int32_t>(workload.z), 0)}));
  RETURN_IF_ERROR(source.BindToIndex(0));
  RETURN_IF_ERROR(destination->BindToIndex(1));
  if (command_queue) {
    return command_queue->Dispatch(program_, num_workgroups);
  }
  return program_.Dispatch(num_workgroups);
}

} // namespace gl
} // namespace gpu
} // namespace tflite
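A worked example of the index arithmetic in the shader above, for illustration only: take W = sizes_.x = 4, H = sizes_.y = 3, C = sizes_.z = 6 and the element at (x = 2, y = 1, c = 5).

// BHWC (output) offset : (y*W + x)*C + c     = (1*4 + 2)*6 + 5 = 41
// PHWC4 slice          : c / 4               = 1
// PHWC4 vec4 offset    : (slice*H + y)*W + x = (1*3 + 1)*4 + 2 = 18
// vec4 component       : c % 4               = 1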
53
tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h
Normal file
53
tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h
Normal file
@ -0,0 +1,53 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_PHWC4_TO_BHWC_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_PHWC4_TO_BHWC_H_

#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"

namespace tflite {
namespace gpu {
namespace gl {

class ConverterPhwc4ToBhwc {
 public:
  // Creates an invalid object.
  ConverterPhwc4ToBhwc() : program_(), workgroup_size_() {}

  static Status Create(ConverterPhwc4ToBhwc* converter);

  Status Convert(const BHWC& shape, const GlBuffer& source,
                 CommandQueue* command_queue /* optional */,
                 GlBuffer* destination);

 private:
  explicit ConverterPhwc4ToBhwc(GlProgram program, const uint3& workgroup_size)
      : program_(std::move(program)), workgroup_size_(workgroup_size) {}

  GlProgram program_;
  uint3 workgroup_size_;
};

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_PHWC4_TO_BHWC_H_
@ -0,0 +1,95 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h"

#include <algorithm>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

inline std::vector<float> GenerateFloats(float multiplier, int size) {
  std::vector<float> v(size);
  for (int i = 0; i < size; ++i) {
    v[i] = multiplier * i * (i % 2 == 0 ? -1 : 1);
  }
  return v;
}

Status RunTest(const BHWC& shape) {
  // Create random input and calculate expected output for it.
  std::vector<float> input =
      GenerateFloats(0.01, GetElementsSizeForPHWC4(shape));
  std::vector<float> output(shape.DimensionsProduct(), 0);
  RETURN_IF_ERROR(
      ConvertFromPHWC4(absl::MakeConstSpan(input.data(), input.size()), shape,
                       absl::MakeSpan(output.data(), output.size())));

  std::unique_ptr<EglEnvironment> env;
  RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env));

  // Create input and output buffers
  GlBuffer input_buffer;
  RETURN_IF_ERROR(CreateReadOnlyShaderStorageBuffer(
      absl::MakeConstSpan(input.data(), input.size()), &input_buffer));

  GlBuffer output_buffer;
  RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
      shape.DimensionsProduct(), &output_buffer));

  // Create converter and run it.
  ConverterPhwc4ToBhwc converter;
  RETURN_IF_ERROR(ConverterPhwc4ToBhwc::Create(&converter));
  RETURN_IF_ERROR(
      converter.Convert(shape, input_buffer, nullptr, &output_buffer));

  std::vector<float> converted_output(output.size(), 0);
  RETURN_IF_ERROR(output_buffer.Read(
      absl::MakeSpan(converted_output.data(), converted_output.size())));
  if (output != converted_output) {
    return InternalError("Outputs don't match");
  }
  return OkStatus();
}

TEST(Phwc4ToHwc, Smoke) {
  for (int32_t h : {1, 2, 3, 7, 20}) {
    for (int32_t w : {1, 2, 4, 5, 11}) {
      for (int32_t c : {1, 2, 4, 5, 8, 9}) {
        BHWC shape(1, h, w, c);
        EXPECT_TRUE(RunTest(shape).ok())
            << shape.h << " " << shape.w << " " << shape.c;
      }
    }
  }
}

} // namespace
} // namespace gl
} // namespace gpu
} // namespace tflite
49
tensorflow/lite/delegates/gpu/gl/converters/util.h
Normal file
49
tensorflow/lite/delegates/gpu/gl/converters/util.h
Normal file
@ -0,0 +1,49 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_UTIL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_UTIL_H_

#include <cstdint>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {

inline std::string GetShaderHeader(const uint3& localsize) {
  return absl::StrCat("#version 310 es\nlayout(local_size_x = ", localsize.x,
                      ", local_size_y = ", localsize.y,
                      ", local_size_z = ", localsize.z, ") in;\n");
}

inline uint32_t BytesForPHWC4(const BHWC& shape) {
  return shape.b * shape.h * shape.w * AlignByN(shape.c, 4) * sizeof(float);
}

inline uint32_t BytesForBHWC(const BHWC& shape) {
  return shape.DimensionsProduct() * sizeof(float);
}

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_CONVERTERS_UTIL_H_
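A quick worked example of the two size helpers above, purely for illustration: for a float tensor of shape BHWC = 1x8x8x3, the channel dimension is padded to 4 in PHWC4, so the converted buffer is larger than the packed BHWC one.

// Inside namespace tflite::gpu::gl; the values follow directly from the
// inline functions above.
BHWC shape(1, 8, 8, 3);
uint32_t bhwc_bytes = BytesForBHWC(shape);    // 1*8*8*3 * sizeof(float) = 768
uint32_t phwc4_bytes = BytesForPHWC4(shape);  // 1*8*8*4 * sizeof(float) = 1024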
143
tensorflow/lite/delegates/gpu/gl/egl_context.cc
Normal file
143
tensorflow/lite/delegates/gpu/gl/egl_context.cc
Normal file
@ -0,0 +1,143 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/egl_context.h"

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_errors.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

Status GetConfig(EGLDisplay display, const EGLint* attributes,
                 EGLConfig* config) {
  EGLint config_count;
  bool chosen = eglChooseConfig(display, attributes, config, 1, &config_count);
  RETURN_IF_ERROR(GetOpenGlErrors());
  if (!chosen || config_count == 0) {
    return InternalError("No EGL error, but eglChooseConfig failed.");
  }
  return OkStatus();
}

Status CreateContext(EGLDisplay display, EGLContext shared_context,
                     EGLConfig config, EglContext* egl_context) {
  static const EGLint attributes[] = {EGL_CONTEXT_CLIENT_VERSION, 3,
#ifdef _DEBUG // Add debugging bit
                                      EGL_CONTEXT_FLAGS_KHR,
                                      EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR,
#endif
                                      EGL_NONE};
  EGLContext context =
      eglCreateContext(display, config, shared_context, attributes);
  RETURN_IF_ERROR(GetOpenGlErrors());
  if (context == EGL_NO_CONTEXT) {
    return InternalError("No EGL error, but eglCreateContext failed.");
  }
  *egl_context = EglContext(context, display, config);
  return OkStatus();
}

bool HasExtension(EGLDisplay display, const char* name) {
  return strstr(eglQueryString(display, EGL_EXTENSIONS), name);
}

} // namespace

void EglContext::Invalidate() {
  if (context_ != EGL_NO_CONTEXT) {
    eglMakeCurrent(display_, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
    eglDestroyContext(display_, context_);
    context_ = EGL_NO_CONTEXT;
  }
}

EglContext::EglContext(EglContext&& other)
    : context_(other.context_),
      display_(other.display_),
      config_(other.config_) {
  other.context_ = EGL_NO_CONTEXT;
}

EglContext& EglContext::operator=(EglContext&& other) {
  if (this != &other) {
    Invalidate();
    std::swap(context_, other.context_);
    display_ = other.display_;
    config_ = other.config_;
  }
  return *this;
}

Status EglContext::MakeCurrent(EGLSurface read, EGLSurface write) {
  bool is_made_current = eglMakeCurrent(display_, write, read, context_);
  RETURN_IF_ERROR(GetOpenGlErrors());
  if (!is_made_current) {
    return InternalError("No EGL error, but eglMakeCurrent failed.");
  }
  return OkStatus();
}

bool EglContext::IsCurrent() const {
  return context_ == eglGetCurrentContext();
}

Status CreateConfiglessContext(EGLDisplay display, EGLContext shared_context,
                               EglContext* egl_context) {
  if (!HasExtension(display, "EGL_KHR_no_config_context")) {
    return UnavailableError("EGL_KHR_no_config_context not supported");
  }
  return CreateContext(display, shared_context, EGL_NO_CONFIG_KHR, egl_context);
}

Status CreateSurfacelessContext(EGLDisplay display, EGLContext shared_context,
                                EglContext* egl_context) {
  if (!HasExtension(display, "EGL_KHR_create_context")) {
    return UnavailableError("EGL_KHR_create_context not supported");
  }
  if (!HasExtension(display, "EGL_KHR_surfaceless_context")) {
    return UnavailableError("EGL_KHR_surfaceless_context not supported");
  }
  const EGLint attributes[] = {EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
                               EGL_NONE};
  EGLConfig config;
  RETURN_IF_ERROR(GetConfig(display, attributes, &config));
  return CreateContext(display, shared_context, config, egl_context);
}

Status CreatePBufferContext(EGLDisplay display, EGLContext shared_context,
                            EglContext* egl_context) {
  const EGLint attributes[] = {EGL_SURFACE_TYPE,
                               EGL_PBUFFER_BIT,
                               EGL_BLUE_SIZE,
                               8,
                               EGL_GREEN_SIZE,
                               8,
                               EGL_RED_SIZE,
                               8,
                               EGL_RENDERABLE_TYPE,
                               EGL_OPENGL_ES3_BIT_KHR,
                               EGL_NONE};
  EGLConfig config;
  RETURN_IF_ERROR(GetConfig(display, attributes, &config));
  return CreateContext(display, shared_context, config, egl_context);
}

} // namespace gl
} // namespace gpu
} // namespace tflite
93
tensorflow/lite/delegates/gpu/gl/egl_context.h
Normal file
93
tensorflow/lite/delegates/gpu/gl/egl_context.h
Normal file
@ -0,0 +1,93 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_CONTEXT_H_

#include <vector>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_egl.h"

namespace tflite {
namespace gpu {
namespace gl {

// EglContext is an RAII wrapper for an EGLContext.
//
// EglContext is moveable but not copyable.
//
// See https://www.khronos.org/registry/EGL/sdk/docs/man/html/eglIntro.xhtml for
// more info.
class EglContext {
 public:
  // Creates an invalid EglContext.
  EglContext()
      : context_(EGL_NO_CONTEXT),
        display_(EGL_NO_DISPLAY),
        config_(EGL_NO_CONFIG_KHR) {}

  EglContext(EGLContext context, EGLDisplay display, EGLConfig config)
      : context_(context), display_(display), config_(config) {}

  // Move only
  EglContext(EglContext&& other);
  EglContext& operator=(EglContext&& other);
  EglContext(const EglContext&) = delete;
  EglContext& operator=(const EglContext&) = delete;

  ~EglContext() { Invalidate(); }

  EGLContext context() const { return context_; }

  EGLDisplay display() const { return display_; }

  EGLConfig config() const { return config_; }

  // Make this EglContext the current EGL context on this thread, replacing
  // the existing current.
  Status MakeCurrent(EGLSurface read, EGLSurface write);

  Status MakeCurrentSurfaceless() {
    return MakeCurrent(EGL_NO_SURFACE, EGL_NO_SURFACE);
  }

  // Returns true if this is the currently bound EGL context.
  bool IsCurrent() const;

 private:
  void Invalidate();

  EGLContext context_;
  EGLDisplay display_;
  EGLConfig config_;
};

// Uses the EGL_KHR_no_config_context extension to create a config-less
// context; most modern hardware supports this extension.
Status CreateConfiglessContext(EGLDisplay display, EGLContext shared_context,
                               EglContext* egl_context);

Status CreateSurfacelessContext(EGLDisplay display, EGLContext shared_context,
                                EglContext* egl_context);

Status CreatePBufferContext(EGLDisplay display, EGLContext shared_context,
                            EglContext* egl_context);

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_CONTEXT_H_
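A hedged sketch of how the factory functions above are typically combined, assuming `display` is an already-initialized EGLDisplay (EglEnvironment below shows how the delegate obtains one); the wrapper function is illustrative:

// Inside namespace tflite::gpu::gl; illustrative only.
Status MakeStandaloneContext(EGLDisplay display, EglContext* context) {
  // Prefer a config-less context and fall back to a surfaceless one.
  if (!CreateConfiglessContext(display, EGL_NO_CONTEXT, context).ok()) {
    RETURN_IF_ERROR(CreateSurfacelessContext(display, EGL_NO_CONTEXT, context));
  }
  return context->MakeCurrentSurfaceless();  // bind without read/draw surfaces
}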
149
tensorflow/lite/delegates/gpu/gl/egl_environment.cc
Normal file
149
tensorflow/lite/delegates/gpu/gl/egl_environment.cc
Normal file
@ -0,0 +1,149 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

// TODO(akulik): detect power management event when all contexts are destroyed
// and OpenGL ES is reinitialized. See eglMakeCurrent

Status InitDisplay(EGLDisplay* egl_display) {
  RETURN_IF_ERROR(
      TFLITE_GPU_CALL_EGL(eglGetDisplay, egl_display, EGL_DEFAULT_DISPLAY));
  if (*egl_display == EGL_NO_DISPLAY) {
    return UnavailableError("eglGetDisplay returned nullptr");
  }
  bool is_initialized;
  RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglInitialize, &is_initialized,
                                      *egl_display, nullptr, nullptr));
  if (!is_initialized) {
    return InternalError("No EGL error, but eglInitialize failed");
  }
  return OkStatus();
}

} // namespace

Status EglEnvironment::NewEglEnvironment(
    std::unique_ptr<EglEnvironment>* egl_environment) {
  *egl_environment = absl::make_unique<EglEnvironment>();
  RETURN_IF_ERROR((*egl_environment)->Init());
  return OkStatus();
}

EglEnvironment::~EglEnvironment() {
  if (dummy_framebuffer_ != GL_INVALID_INDEX) {
    glDeleteFramebuffers(1, &dummy_framebuffer_);
  }
  if (dummy_texture_ != GL_INVALID_INDEX) {
    glDeleteTextures(1, &dummy_texture_);
  }
}

Status EglEnvironment::Init() {
  bool is_bound;
  RETURN_IF_ERROR(
      TFLITE_GPU_CALL_EGL(eglBindAPI, &is_bound, EGL_OPENGL_ES_API));
  if (!is_bound) {
    return InternalError("No EGL error, but eglBindAPI failed");
  }

  // Re-use the context and display if they were created on this thread.
  if (eglGetCurrentContext() != EGL_NO_CONTEXT) {
    display_ = eglGetCurrentDisplay();
    context_ = EglContext(eglGetCurrentContext(), display_, EGL_NO_CONFIG_KHR);
  } else {
    RETURN_IF_ERROR(InitDisplay(&display_));

    Status status = InitConfiglessContext();
    if (!status.ok()) {
      status = InitSurfacelessContext();
    }
    if (!status.ok()) {
      status = InitPBufferContext();
    }
    if (!status.ok()) {
      return status;
    }
  }

  if (gpu_info_.type == GpuType::UNKNOWN) {
    RETURN_IF_ERROR(RequestGpuInfo(&gpu_info_));
  }
  // TODO(akulik): when do we need ForceSyncTurning?
  ForceSyncTurning();
  return OkStatus();
}

Status EglEnvironment::InitConfiglessContext() {
  RETURN_IF_ERROR(CreateConfiglessContext(display_, EGL_NO_CONTEXT, &context_));
  return context_.MakeCurrentSurfaceless();
}

Status EglEnvironment::InitSurfacelessContext() {
  RETURN_IF_ERROR(
      CreateSurfacelessContext(display_, EGL_NO_CONTEXT, &context_));
  Status status = context_.MakeCurrentSurfaceless();
  if (!status.ok()) {
    return status;
  }

  // PowerVR supports EGL_KHR_surfaceless_context, but glFenceSync crashes on
  // PowerVR when the context is surface-less.
  RETURN_IF_ERROR(RequestGpuInfo(&gpu_info_));
  if (gpu_info_.type == GpuType::POWERVR) {
    return UnavailableError(
        "Surface-less context is not properly supported on powervr.");
  }
  return OkStatus();
}

Status EglEnvironment::InitPBufferContext() {
  RETURN_IF_ERROR(CreatePBufferContext(display_, EGL_NO_CONTEXT, &context_));
  RETURN_IF_ERROR(CreatePbufferRGBSurface(context_.config(), display_, 1, 1,
                                          &surface_read_));
  RETURN_IF_ERROR(CreatePbufferRGBSurface(context_.config(), display_, 1, 1,
                                          &surface_draw_));
  return context_.MakeCurrent(surface_read_.surface(), surface_draw_.surface());
}

void EglEnvironment::ForceSyncTurning() {
  glGenFramebuffers(1, &dummy_framebuffer_);
  glBindFramebuffer(GL_FRAMEBUFFER, dummy_framebuffer_);

  glGenTextures(1, &dummy_texture_);
  glBindTexture(GL_TEXTURE_2D, dummy_texture_);
  glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, 4, 4);
  glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
                         dummy_texture_, 0);

  GLenum draw_buffers[1] = {GL_COLOR_ATTACHMENT0};
  glDrawBuffers(1, draw_buffers);

  glViewport(0, 0, 4, 4);
  glClear(GL_COLOR_BUFFER_BIT);
}

} // namespace gl
} // namespace gpu
} // namespace tflite
72
tensorflow/lite/delegates/gpu/gl/egl_environment.h
Normal file
72
tensorflow/lite/delegates/gpu/gl/egl_environment.h
Normal file
@ -0,0 +1,72 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_ENVIRONMENT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_ENVIRONMENT_H_

#include <memory>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_context.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_surface.h"
#include "tensorflow/lite/delegates/gpu/gl/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_egl.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {
namespace gl {

// This class encapsulates the creation of the OpenGL objects needed before
// working with OpenGL: it binds the OpenGL ES API, creates a new EGL context,
// binds it to an EGL display and creates surfaces if needed.
//
// An EGL environment needs to be created once per thread.
class EglEnvironment {
 public:
  static Status NewEglEnvironment(
      std::unique_ptr<EglEnvironment>* egl_environment);

  EglEnvironment() = default;
  ~EglEnvironment();

  const EglContext& context() const { return context_; }
  EGLDisplay display() const { return display_; }
  const GpuInfo& gpu_info() const { return gpu_info_; }

 private:
  Status Init();
  Status InitConfiglessContext();
  Status InitSurfacelessContext();
  Status InitPBufferContext();

  EGLDisplay display_ = EGL_NO_DISPLAY;
  EglContext context_;
  EglSurface surface_draw_;
  EglSurface surface_read_;
  GpuInfo gpu_info_;

  // Strange hack that helps on Mali GPUs;
  // without it glFinish and glFenceSync don't work.
  void ForceSyncTurning();
  GLuint dummy_framebuffer_ = GL_INVALID_INDEX;
  GLuint dummy_texture_ = GL_INVALID_INDEX;
};

} // namespace gl
} // namespace gpu
} // namespace tflite

#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_ENVIRONMENT_H_
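A usage sketch matching how the converter tests in this change drive the environment: one environment per thread, created before any other GL calls. The wrapper function is illustrative and not part of this commit:

// Inside namespace tflite::gpu::gl; illustrative only.
Status SetUpGl(std::unique_ptr<EglEnvironment>* env) {
  RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(env));
  // (*env)->gpu_info() can now be used to specialize shaders per vendor, and
  // the new EGL context is already current on this thread.
  return OkStatus();
}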
71
tensorflow/lite/delegates/gpu/gl/egl_surface.cc
Normal file
71
tensorflow/lite/delegates/gpu/gl/egl_surface.cc
Normal file
@ -0,0 +1,71 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/egl_surface.h"

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_errors.h"

namespace tflite {
namespace gpu {
namespace gl {

EglSurface::EglSurface(EglSurface&& other)
    : surface_(other.surface_), display_(other.display_) {
  other.surface_ = EGL_NO_SURFACE;
}

EglSurface& EglSurface::operator=(EglSurface&& other) {
  if (this != &other) {
    display_ = other.display_;
    Invalidate();
    std::swap(surface_, other.surface_);
  }
  return *this;
}

void EglSurface::Invalidate() {
  if (surface_ != EGL_NO_SURFACE) {
    eglDestroySurface(display_, surface_);
    surface_ = EGL_NO_SURFACE;
  }
}

Status CreatePbufferRGBSurface(EGLConfig config, EGLDisplay display,
                               uint32_t height, uint32_t width,
                               EglSurface* egl_surface) {
  const EGLint pbuffer_attributes[] = {EGL_WIDTH,
                                       static_cast<EGLint>(width),
                                       EGL_HEIGHT,
                                       static_cast<EGLint>(height),
                                       EGL_TEXTURE_FORMAT,
                                       EGL_TEXTURE_RGB,
                                       EGL_TEXTURE_TARGET,
                                       EGL_TEXTURE_2D,
                                       EGL_NONE};
  EGLSurface surface =
      eglCreatePbufferSurface(display, config, pbuffer_attributes);
  RETURN_IF_ERROR(GetOpenGlErrors());
  if (surface == EGL_NO_SURFACE) {
    return InternalError("No EGL error, but eglCreatePbufferSurface failed");
  }
  *egl_surface = EglSurface(surface, display);
  return OkStatus();
}

} // namespace gl
} // namespace gpu
} // namespace tflite
67
tensorflow/lite/delegates/gpu/gl/egl_surface.h
Normal file
67
tensorflow/lite/delegates/gpu/gl/egl_surface.h
Normal file
@ -0,0 +1,67 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_SURFACE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_SURFACE_H_

#include <cstdint>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_egl.h"

namespace tflite {
namespace gpu {
namespace gl {

// An RAII wrapper for EGLSurface.
// See https://www.khronos.org/registry/EGL/sdk/docs/man/html/eglIntro.xhtml for
// an introduction to the concepts.
//
// EglSurface is moveable but not copyable.
class EglSurface {
 public:
  // Creates an invalid EglSurface.
  EglSurface() : surface_(EGL_NO_SURFACE), display_(EGL_NO_DISPLAY) {}

  EglSurface(EGLSurface surface, EGLDisplay display)
      : surface_(surface), display_(display) {}

  // Move-only
  EglSurface(EglSurface&& other);
  EglSurface& operator=(EglSurface&& other);
  EglSurface(const EglSurface&) = delete;
  EglSurface& operator=(const EglSurface&) = delete;

  ~EglSurface() { Invalidate(); }

  EGLSurface surface() const { return surface_; }

 private:
  void Invalidate();

  EGLSurface surface_;
  EGLDisplay display_;
};

// Creates off-screen pbuffer-based surface of the given height and width.
Status CreatePbufferRGBSurface(EGLConfig config, EGLDisplay display,
                               uint32_t height, uint32_t width,
                               EglSurface* egl_surface);

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_EGL_SURFACE_H_
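A minimal usage sketch for the surface API above (illustration only, not part of this commit). It assumes an already initialized EGL display and a matching EGLConfig, for example the ones owned by EglEnvironment, and a surrounding function that returns Status:

  // Creates a 1x1 off-screen pbuffer surface; the underlying EGLSurface is
  // destroyed automatically when `surface` goes out of scope.
  EglSurface surface;
  RETURN_IF_ERROR(CreatePbufferRGBSurface(config, display,
                                          /*height=*/1, /*width=*/1, &surface));
  // surface.surface() can now be passed to eglMakeCurrent().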
73
tensorflow/lite/delegates/gpu/gl/float16_conversions.cc
Normal file
73
tensorflow/lite/delegates/gpu/gl/float16_conversions.cc
Normal file
@ -0,0 +1,73 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/float16_conversions.h"

#include <cstdint>
#include <vector>

#include <fp16.h>
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

// Performs in-place conversion of float32 into float16
bool ToFloat16(std::vector<uint8_t>* values) {
  if (values->size() % sizeof(float) != 0) {
    return false;
  }

  uint16_t* store_f16 = reinterpret_cast<uint16_t*>(values->data());
  const float* load_f32 = reinterpret_cast<const float*>(values->data());
  const float* end_load_f32 =
      reinterpret_cast<const float*>(values->data() + values->size());

  while (load_f32 != end_load_f32) {
    *store_f16++ = fp16_ieee_from_fp32_value(*load_f32++);
  }

  values->resize(values->size() / 2);
  return true;
}

struct ConverterToFloat16 {
  bool operator()(ObjectData& data) const {  // NOLINT
    return ToFloat16(&data);
  }

  bool operator()(ObjectRef& buffer) const {  // NOLINT
    return true;
  }
};

}  // namespace

bool MaybeConvertToFloat16(Object* object) {
  if (object->data_type == DataType::FLOAT32 &&
      absl::visit(ConverterToFloat16(), object->object)) {
    object->data_type = DataType::FLOAT16;
    return true;
  }
  return false;
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
32
tensorflow/lite/delegates/gpu/gl/float16_conversions.h
Normal file
32
tensorflow/lite/delegates/gpu/gl/float16_conversions.h
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_FLOAT16_CONVERSIONS_H_
|
||||||
|
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_FLOAT16_CONVERSIONS_H_
|
||||||
|
|
||||||
|
#include "tensorflow/lite/delegates/gpu/gl/object.h"
|
||||||
|
|
||||||
|
namespace tflite {
|
||||||
|
namespace gpu {
|
||||||
|
namespace gl {
|
||||||
|
|
||||||
|
// If an object is float32, converts it to float16 representation.
|
||||||
|
bool MaybeConvertToFloat16(Object* object);
|
||||||
|
|
||||||
|
} // namespace gl
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace tflite
|
||||||
|
|
||||||
|
#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_FLOAT16_CONVERSIONS_H_
|
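The conversion above is built on fp16_ieee_from_fp32_value from the bundled FP16 library. A standalone sketch of the same float32-to-float16 packing (illustration only, not part of this commit; the helper name PackAsFloat16 is hypothetical):

  #include <cstdint>
  #include <vector>

  #include <fp16.h>

  // Packs float32 values into IEEE half-precision words, mirroring the
  // in-place loop in ToFloat16 but writing into a separate vector.
  std::vector<uint16_t> PackAsFloat16(const std::vector<float>& values) {
    std::vector<uint16_t> result;
    result.reserve(values.size());
    for (float v : values) {
      result.push_back(fp16_ieee_from_fp32_value(v));
    }
    return result;
  }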
89
tensorflow/lite/delegates/gpu/gl/gl_buffer.cc
Normal file
89
tensorflow/lite/delegates/gpu/gl/gl_buffer.cc
Normal file
@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"

#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace gl {

Status CopyBuffer(const GlBuffer& read_buffer, const GlBuffer& write_buffer) {
  if (read_buffer.bytes_size() != write_buffer.bytes_size()) {
    return InvalidArgumentError(
        "Read buffer does not match write buffer size.");
  }
  gl_buffer_internal::BufferBinder read_buffer_binder(GL_COPY_READ_BUFFER,
                                                      read_buffer.id());
  gl_buffer_internal::BufferBinder write_buffer_binder(GL_COPY_WRITE_BUFFER,
                                                       write_buffer.id());
  return TFLITE_GPU_CALL_GL(glCopyBufferSubData, GL_COPY_READ_BUFFER,
                            GL_COPY_WRITE_BUFFER, read_buffer.offset(),
                            write_buffer.offset(), read_buffer.bytes_size());
}

GlBuffer::GlBuffer(GlBuffer&& buffer)
    : GlBuffer(buffer.target_, buffer.id_, buffer.bytes_size_, buffer.offset_,
               buffer.has_ownership_) {
  buffer.has_ownership_ = false;
}

GlBuffer& GlBuffer::operator=(GlBuffer&& buffer) {
  if (this != &buffer) {
    Invalidate();

    target_ = buffer.target_;
    bytes_size_ = buffer.bytes_size_;
    offset_ = buffer.offset_;
    has_ownership_ = buffer.has_ownership_;
    id_ = buffer.id_;
    buffer.has_ownership_ = false;
  }
  return *this;
}

GlBuffer::~GlBuffer() { Invalidate(); }

void GlBuffer::Invalidate() {
  if (has_ownership_ && id_ != GL_INVALID_INDEX) {
    TFLITE_GPU_CALL_GL(glDeleteBuffers, 1, &id_).IgnoreError();
    id_ = GL_INVALID_INDEX;
  }
}

Status GlBuffer::BindToIndex(uint32_t index) const {
  return TFLITE_GPU_CALL_GL(glBindBufferRange, target_, index, id_, offset_,
                            bytes_size_);
}

Status GlBuffer::MakeView(size_t offset, size_t bytes_size,
                          GlBuffer* gl_buffer) {
  if (offset + bytes_size > bytes_size_) {
    return OutOfRangeError("GlBuffer view is out of range.");
  }
  *gl_buffer = GlBuffer(target_, id_, bytes_size, offset_ + offset,
                        /*has_ownership=*/false);
  return OkStatus();
}

GlBuffer GlBuffer::MakeRef() {
  return GlBuffer(target_, id_, bytes_size_, offset_,
                  /* has_ownership = */ false);
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite
298
tensorflow/lite/delegates/gpu/gl/gl_buffer.h
Normal file
298
tensorflow/lite/delegates/gpu/gl/gl_buffer.h
Normal file
@ -0,0 +1,298 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_BUFFER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_BUFFER_H_

#include <cstring>
#include <functional>
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_errors.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {
namespace gl {

// Buffer is an RAII wrapper for an OpenGL buffer object.
// See https://www.khronos.org/opengl/wiki/Buffer_Object for more information.
//
// Buffer is moveable but not copyable.
class GlBuffer {
 public:
  // @param has_ownership indicates that GlBuffer is responsible for
  // corresponding GL buffer deletion.
  GlBuffer(GLenum target, GLuint id, size_t bytes_size, size_t offset,
           bool has_ownership)
      : target_(target),
        id_(id),
        bytes_size_(bytes_size),
        offset_(offset),
        has_ownership_(has_ownership) {}

  // Creates an invalid buffer.
  GlBuffer() : GlBuffer(GL_INVALID_ENUM, GL_INVALID_INDEX, 0, 0, false) {}

  // Move-only
  GlBuffer(GlBuffer&& buffer);
  GlBuffer& operator=(GlBuffer&& buffer);
  GlBuffer(const GlBuffer&) = delete;
  GlBuffer& operator=(const GlBuffer&) = delete;

  ~GlBuffer();

  // Reads data from buffer into CPU memory. Data should point to a region that
  // has at least bytes_size available.
  template <typename T>
  Status Read(absl::Span<T> data) const;

  // Writes data to a buffer.
  template <typename T>
  Status Write(absl::Span<const T> data);

  // Maps GPU memory to CPU address space and calls reader that may read from
  // that memory.
  template <typename T>
  Status MappedRead(
      const std::function<Status(absl::Span<const T>)>& reader) const;

  // Maps GPU memory to CPU address space and calls writer that may write into
  // that memory.
  template <typename T>
  Status MappedWrite(const std::function<Status(absl::Span<T>)>& writer);

  Status MakeView(size_t offset, size_t bytes_size, GlBuffer* gl_buffer);

  // Makes a copy without ownership of the buffer.
  GlBuffer MakeRef();

  // Binds a buffer to an index.
  Status BindToIndex(uint32_t index) const;

  // Releases the ownership of the buffer object.
  void Release() { has_ownership_ = false; }

  size_t bytes_size() const { return bytes_size_; }

  const GLenum target() const { return target_; }

  const GLuint id() const { return id_; }

  bool is_valid() const { return id_ != GL_INVALID_INDEX; }

  size_t offset() const { return offset_; }

  // @return true if this object actually owns the corresponding GL buffer
  // and manages its lifetime.
  bool has_ownership() const { return has_ownership_; }

 private:
  void Invalidate();

  GLenum target_;
  GLuint id_;
  size_t bytes_size_;
  size_t offset_;
  bool has_ownership_;
};

Status CopyBuffer(const GlBuffer& read_buffer, const GlBuffer& write_buffer);

// Creates a new shader storage buffer that will be modified and used many
// times.
//
// See https://www.khronos.org/opengl/wiki/Shader_Storage_Buffer_Object for
// details.
template <typename T>
Status CreateReadWriteShaderStorageBuffer(uint32_t num_elements,
                                          GlBuffer* gl_buffer);

// Creates a new shader storage buffer that will be filled with data once and
// then used many times.
template <typename T>
Status CreateReadOnlyShaderStorageBuffer(absl::Span<const T> data,
                                         GlBuffer* gl_buffer);

// Adapts raw Buffer::Read method to read data into a vector.
template <typename T>
Status AppendFromBuffer(const GlBuffer& buffer, std::vector<T>* data) {
  if (buffer.bytes_size() % sizeof(T) != 0) {
    return InvalidArgumentError("Buffer is not aligned");
  }
  size_t num_elements = buffer.bytes_size() / sizeof(T);
  data->resize(data->size() + num_elements);
  return buffer.Read<T>(
      absl::MakeSpan(data->data() + data->size() - num_elements, num_elements));
}

////////////////////////////////////////////////////////////////////////////////
// Implementation details are below.

namespace gl_buffer_internal {

// RAII for creating and/or owning buffer id.
class BufferId {
 public:
  BufferId() : id_(GL_INVALID_INDEX) {
    TFLITE_GPU_CALL_GL(glGenBuffers, 1 /* number of buffers */, &id_)
        .IgnoreError();
    // only possible error here is when a number of buffers is negative.
  }

  explicit BufferId(GLuint id) : id_(id) {}

  ~BufferId() {
    if (id_ != GL_INVALID_INDEX) {
      TFLITE_GPU_CALL_GL(glDeleteBuffers, 1, &id_).IgnoreError();
    }
  }

  GLuint id() const { return id_; }

  GLuint Release() {
    GLuint id = GL_INVALID_INDEX;
    std::swap(id, id_);
    return id;
  }

 private:
  GLuint id_;
};

// RAII for binding and unbinding a buffer.
class BufferBinder {
 public:
  BufferBinder(GLenum target, GLuint id) : target_(target) {
    TFLITE_GPU_CALL_GL(glBindBuffer, target_, id).IgnoreError();
  }

  ~BufferBinder() {
    TFLITE_GPU_CALL_GL(glBindBuffer, target_, 0).IgnoreError();
  }

 private:
  const GLenum target_;
};

// RAII for mapping and unmapping a buffer.
class BufferMapper {
 public:
  BufferMapper(GLenum target, size_t offset, size_t bytes, GLbitfield access)
      : target_(target),
        data_(glMapBufferRange(target_, offset, bytes, access)) {}

  ~BufferMapper() { TFLITE_GPU_CALL_GL(glUnmapBuffer, target_).IgnoreError(); }

  void* data() { return data_; }

 private:
  const GLenum target_;
  void* data_;
};

}  // namespace gl_buffer_internal

template <typename T>
Status CreateReadWriteShaderStorageBuffer(uint32_t num_elements,
                                          GlBuffer* gl_buffer) {
  gl_buffer_internal::BufferId id;
  gl_buffer_internal::BufferBinder binder(GL_SHADER_STORAGE_BUFFER, id.id());
  // TODO(akulik): benchmark DYNAMIC vs STREAM buffer
  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glBufferData, GL_SHADER_STORAGE_BUFFER,
                                     num_elements * sizeof(T), nullptr,
                                     GL_STREAM_COPY));
  *gl_buffer = GlBuffer{GL_SHADER_STORAGE_BUFFER, id.Release(),
                        num_elements * sizeof(T), 0, true};
  return OkStatus();
}

template <typename T>
Status CreateReadOnlyShaderStorageBuffer(absl::Span<const T> data,
                                         GlBuffer* gl_buffer) {
  gl_buffer_internal::BufferId id;
  gl_buffer_internal::BufferBinder binder(GL_SHADER_STORAGE_BUFFER, id.id());
  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glBufferData, GL_SHADER_STORAGE_BUFFER,
                                     data.size() * sizeof(T), data.data(),
                                     GL_STATIC_READ));
  *gl_buffer = GlBuffer{GL_SHADER_STORAGE_BUFFER, id.Release(),
                        data.size() * sizeof(T), 0, true};
  return OkStatus();
}

template <typename T>
Status GlBuffer::Read(absl::Span<T> data) const {
  if (data.size() * sizeof(T) < bytes_size()) {
    return InvalidArgumentError(
        "Read from buffer failed. Destination data is shorter than buffer.");
  }
  // TODO(akulik): glCopyBufferSubData is actually available in ES 3.1, try it.
  return MappedRead<T>([this, data](absl::Span<const T> src) {
    std::memcpy(data.data(), src.data(), bytes_size());
    return OkStatus();
  });
}

template <typename T>
Status GlBuffer::Write(absl::Span<const T> data) {
  if (data.size() * sizeof(T) > bytes_size_) {
    return InvalidArgumentError(
        "Write to buffer failed. Source data is larger than buffer.");
  }
  gl_buffer_internal::BufferBinder binder(target_, id_);
  return TFLITE_GPU_CALL_GL(glBufferSubData, target_, offset_, bytes_size_,
                            data.data());
}

template <typename T>
Status GlBuffer::MappedRead(
    const std::function<Status(absl::Span<const T> d)>& reader) const {
  if (bytes_size_ % sizeof(T) != 0) {
    return InvalidArgumentError("Buffer is not aligned");
  }
  gl_buffer_internal::BufferBinder binder(target_, id_);
  gl_buffer_internal::BufferMapper mapper(target_, offset_, bytes_size_,
                                          GL_MAP_READ_BIT);
  if (!mapper.data()) {
    return GetOpenGlErrors();
  }
  return reader(absl::MakeSpan(reinterpret_cast<const T*>(mapper.data()),
                               bytes_size_ / sizeof(T)));
}

template <typename T>
Status GlBuffer::MappedWrite(
    const std::function<Status(absl::Span<T> d)>& writer) {
  if (bytes_size_ % sizeof(T) != 0) {
    return InvalidArgumentError("Buffer is not aligned");
  }
  gl_buffer_internal::BufferBinder binder(target_, id_);
  gl_buffer_internal::BufferMapper mapper(target_, offset_, bytes_size_,
                                          GL_MAP_WRITE_BIT);
  if (!mapper.data()) {
    return GetOpenGlErrors();
  }
  return writer(absl::MakeSpan(reinterpret_cast<T*>(mapper.data()),
                               bytes_size_ / sizeof(T)));
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_BUFFER_H_
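A usage sketch of the buffer helpers declared above (illustration only, not part of this commit). It assumes a current GL context, for example one created via EglEnvironment as in the test below, and a surrounding function that returns Status:

  // Uploads four floats into a read-only SSBO and reads them back.
  std::vector<float> input = {0.0f, 1.0f, 2.0f, 3.0f};
  GlBuffer buffer;
  RETURN_IF_ERROR(CreateReadOnlyShaderStorageBuffer<float>(input, &buffer));
  std::vector<float> output;
  RETURN_IF_ERROR(AppendFromBuffer(buffer, &output));  // output == input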
126
tensorflow/lite/delegates/gpu/gl/gl_buffer_test.cc
Normal file
126
tensorflow/lite/delegates/gpu/gl/gl_buffer_test.cc
Normal file
@ -0,0 +1,126 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"

#include <memory>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

TEST(Buffer, Read) {
  std::unique_ptr<EglEnvironment> env;
  ASSERT_TRUE(EglEnvironment::NewEglEnvironment(&env).ok());
  std::vector<float> test = {0, 1, 2, 3};
  GlBuffer buffer;
  ASSERT_TRUE(CreateReadOnlyShaderStorageBuffer<float>(test, &buffer).ok());
  std::vector<float> from_buffer;
  ASSERT_TRUE(AppendFromBuffer(buffer, &from_buffer).ok());
  EXPECT_EQ(test, from_buffer);
}

TEST(Buffer, Write) {
  std::unique_ptr<EglEnvironment> env;
  ASSERT_TRUE(EglEnvironment::NewEglEnvironment(&env).ok());
  GlBuffer buffer;
  ASSERT_TRUE(CreateReadWriteShaderStorageBuffer<float>(4, &buffer).ok());
  std::vector<float> test = {0, 1, 2, 3};
  ASSERT_TRUE(buffer.Write<float>(test).ok());
  std::vector<float> from_buffer;
  ASSERT_TRUE(AppendFromBuffer(buffer, &from_buffer).ok());
  EXPECT_EQ(test, from_buffer);
}

TEST(Buffer, View) {
  std::unique_ptr<EglEnvironment> env;
  ASSERT_TRUE(EglEnvironment::NewEglEnvironment(&env).ok());
  GlBuffer buffer;
  ASSERT_TRUE(CreateReadWriteShaderStorageBuffer<float>(6, &buffer).ok());
  EXPECT_TRUE(buffer.has_ownership());
  EXPECT_EQ(24, buffer.bytes_size());
  EXPECT_EQ(0, buffer.offset());

  // Create a view and write data through it. MakeView expects a pointer to an
  // existing GlBuffer, so the view lives on the stack here.
  GlBuffer view;
  ASSERT_TRUE(buffer.MakeView(4, 16, &view).ok());
  EXPECT_FALSE(view.has_ownership());
  EXPECT_EQ(16, view.bytes_size());
  EXPECT_EQ(4, view.offset());
  std::vector<float> test = {1, 2, 3, 4};
  ASSERT_TRUE(view.Write<float>(test).ok());

  // Check that data indeed landed in a buffer with proper offset.
  std::vector<float> from_buffer;
  ASSERT_TRUE(AppendFromBuffer(buffer, &from_buffer).ok());
  EXPECT_THAT(from_buffer, testing::ElementsAre(0, 1, 2, 3, 4, 0));

  std::vector<float> from_view;
  ASSERT_TRUE(AppendFromBuffer(view, &from_view).ok());
  EXPECT_THAT(from_view, testing::ElementsAre(1, 2, 3, 4));
}

TEST(Buffer, SubView) {
  std::unique_ptr<EglEnvironment> env;
  ASSERT_TRUE(EglEnvironment::NewEglEnvironment(&env).ok());
  GlBuffer buffer;
  ASSERT_TRUE(CreateReadWriteShaderStorageBuffer<float>(6, &buffer).ok());

  // Create a view and another view over that view. The second call must fail
  // because it reaches past the end of the first view.
  GlBuffer view1;
  ASSERT_TRUE(buffer.MakeView(4, 16, &view1).ok());
  GlBuffer view2;
  EXPECT_NE(view1.MakeView(1, 16, &view2), OkStatus());
  ASSERT_TRUE(view1.MakeView(2, 2, &view2).ok());

  EXPECT_FALSE(view2.has_ownership());
  EXPECT_EQ(2, view2.bytes_size());
  EXPECT_EQ(6, view2.offset());
}

TEST(Buffer, Copy) {
  std::unique_ptr<EglEnvironment> env;
  ASSERT_TRUE(EglEnvironment::NewEglEnvironment(&env).ok());
  GlBuffer buffer;
  ASSERT_TRUE(CreateReadWriteShaderStorageBuffer<float>(4, &buffer).ok());

  // Create two views over different parts of the buffer.
  GlBuffer view1;
  ASSERT_TRUE(buffer.MakeView(4, 4, &view1).ok());

  GlBuffer view2;
  ASSERT_TRUE(buffer.MakeView(8, 4, &view2).ok());

  // Copy data from one view to another.
  ASSERT_TRUE(view1.Write<float>({1}).ok());
  ASSERT_TRUE(CopyBuffer(view1, view2).ok());

  // Check that data indeed landed correctly.
  std::vector<float> from_buffer;
  ASSERT_TRUE(AppendFromBuffer(buffer, &from_buffer).ok());
  EXPECT_THAT(from_buffer, testing::ElementsAre(0, 1, 1, 0));
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite
115
tensorflow/lite/delegates/gpu/gl/gl_call.h
Normal file
115
tensorflow/lite/delegates/gpu/gl/gl_call.h
Normal file
@ -0,0 +1,115 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_CALL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_CALL_H_

#include <string>
#include <type_traits>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_errors.h"

namespace tflite {
namespace gpu {
namespace gl {

// Primary purpose of this file is to provide a useful macro for calling GL
// functions and checking errors. It also attaches a context to the status in
// case of a GL error.
//
// Use TFLITE_GPU_CALL_GL as follows:
//
// For GL functions with a return value:
//   Before:
//     GLint result = glFunc(...);
//     RETURN_IF_ERROR(GetOpenGlErrors());
//   After:
//     GLint result;
//     RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glFunc, &result, ...));
//
// For GL functions without a return value:
//   Before:
//     glFunc(...);
//     RETURN_IF_ERROR(GetOpenGlErrors());
//   After:
//     RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glFunc, ...));

namespace gl_call_internal {

// For GL functions with a return value.
template <typename T>
struct Caller {
  template <typename F, typename ErrorF, typename... Params>
  Status operator()(const std::string& context, F func, ErrorF error_func,
                    T* result, Params&&... params) {
    *result = func(std::forward<Params>(params)...);
    const auto status = error_func();
    if (status.ok()) return OkStatus();
    return Status(status.code(), status.error_message() + ": " + context);
  }
};

// For GL functions without a return value.
template <>
struct Caller<void> {
  template <typename F, typename ErrorF, typename... Params>
  Status operator()(const std::string& context, F func, ErrorF error_func,
                    Params&&... params) {
    func(std::forward<Params>(params)...);
    const auto status = error_func();
    if (status.ok()) return OkStatus();
    return Status(status.code(), status.error_message() + ": " + context);
  }
};

template <typename F, typename ErrorF, typename ResultT, typename... ParamsT>
Status CallAndCheckError(const std::string& context, F func, ErrorF error_func,
                         ResultT* result, ParamsT&&... params) {
  return Caller<ResultT>()(context, func, error_func, result,
                           std::forward<ParamsT>(params)...);
}

template <typename F, typename ErrorF, typename... Params>
Status CallAndCheckError(const std::string& context, F func, ErrorF error_func,
                         Params&&... params) {
  return Caller<void>()(context, func, error_func,
                        std::forward<Params>(params)...);
}

}  // namespace gl_call_internal

// XX_STRINGIFY is a helper macro to effectively apply # operator to an
// arbitrary value.
#define TFLITE_GPU_INTERNAL_STRINGIFY_HELPER(x) #x
#define TFLITE_GPU_INTERNAL_STRINGIFY(x) TFLITE_GPU_INTERNAL_STRINGIFY_HELPER(x)
#define TFLITE_GPU_FILE_LINE \
  __FILE__ ":" TFLITE_GPU_INTERNAL_STRINGIFY(__LINE__)

#define TFLITE_GPU_CALL_GL(method, ...)                   \
  ::tflite::gpu::gl::gl_call_internal::CallAndCheckError( \
      #method " in " TFLITE_GPU_FILE_LINE, method,        \
      ::tflite::gpu::gl::GetOpenGlErrors, __VA_ARGS__)

#define TFLITE_GPU_CALL_EGL(method, ...)                  \
  ::tflite::gpu::gl::gl_call_internal::CallAndCheckError( \
      #method " in " TFLITE_GPU_FILE_LINE, method,        \
      ::tflite::gpu::gl::GetEglError, __VA_ARGS__)

}  // namespace gl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_CALL_H_
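A concrete sketch of the macro in both modes described above (illustration only, not part of this commit; assumes a current GL ES context and a surrounding function that returns Status):

  // With a return value: the pointer right after the function receives it.
  GLuint program_id = 0;
  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glCreateProgram, &program_id));

  // Without a return value: arguments are forwarded as-is.
  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(glDeleteProgram, program_id));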
Some files were not shown because too many files have changed in this diff