Add a define for building an OpenCL-only delegate.
By default the OpenCL delegate supports GL interop and links against the GL/EGL libraries. With this define the delegate can be built without any dependency on the GL libraries.

PiperOrigin-RevId: 322210374
Change-Id: Id02c53747873a474c31971553048c73d3506f4ae
commit 6ee2d328fd (parent ecaf86d96c)
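As a rough illustration (not part of the commit itself), the new opencl_delegate_no_gl config_setting matches builds that pass the define as a copt, so a GL-free build of the GPU delegate could be invoked along these lines; the exact target to build depends on how the delegate is consumed:

    bazel build --copt=-DCL_DELEGATE_NO_GL //tensorflow/lite/delegates/gpu:delegate

Without that flag, the default branch of the select() keeps the GL/EGL dependencies and GL interop stays available.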
@@ -234,7 +234,14 @@ cc_library(
         ],
         "//conditions:default": [],
     }),
-    deps = [
+    deps = select({
+        "//tensorflow/lite/delegates/gpu/cl:opencl_delegate_no_gl": [],
+        "//conditions:default": [
+            "//tensorflow/lite/delegates/gpu/gl:api2",
+        ],
+    }) + [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:span",
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite/c:common",
@@ -247,9 +254,6 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:model_transformer",
         "//tensorflow/lite/delegates/gpu/common:quantization_util",
         "//tensorflow/lite/delegates/gpu/common:status",
-        "//tensorflow/lite/delegates/gpu/gl:api2",
         "//tensorflow/lite/kernels/internal:optimized_base",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/types:span",
     ],
 )
@@ -43,9 +43,14 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
 #include "tensorflow/lite/delegates/gpu/common/util.h"
-#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
 #include <vulkan/vulkan.h>
 
+#define GL_NO_PROTOTYPES
+#define EGL_NO_PROTOTYPES
+#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
+#undef GL_NO_PROTOTYPES
+#undef EGL_NO_PROTOTYPES
+
 namespace tflite {
 namespace gpu {
 
@@ -9,23 +9,34 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
+config_setting(
+    name = "opencl_delegate_no_gl",
+    values = {"copt": "-DCL_DELEGATE_NO_GL"},
+)
+
 cc_library(
     name = "api",
     srcs = ["api.cc"],
     hdrs = ["api.h"],
-    deps = [
+    deps = select({
+        ":opencl_delegate_no_gl": [],
+        "//conditions:default": [
+            ":egl_sync",
+            ":gl_interop",
+        ],
+    }) + [
         ":cl_command_queue",
         ":cl_errors",
         ":cl_event",
-        ":egl_sync",
         ":environment",
-        ":gl_interop",
         ":inference_context",
         ":opencl_wrapper",
         ":precision",
         ":tensor",
         ":tensor_type",
         ":tensor_type_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:span",
         "//tensorflow/lite/delegates/gpu:api",
         "//tensorflow/lite/delegates/gpu/cl/kernels:converter",
         "//tensorflow/lite/delegates/gpu/common:data_type",
@@ -33,8 +44,6 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:tensor",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include "tensorflow/lite/delegates/gpu/cl/api.h"
 
-#include <EGL/eglext.h>
+#ifndef CL_DELEGATE_NO_GL
+#define CL_DELEGATE_ALLOW_GL
+#endif
 
 #include <algorithm>
 #include <cstring>
@@ -25,9 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
-#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
 #include "tensorflow/lite/delegates/gpu/cl/environment.h"
-#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
 #include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
@@ -39,6 +39,13 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
 
+#ifdef CL_DELEGATE_ALLOW_GL
+#include <EGL/eglext.h>
+
+#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
+#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
+#endif
+
 namespace tflite {
 namespace gpu {
 namespace cl {
@@ -87,11 +94,13 @@ class DefaultTensorTie : public TensorTie {
       const TensorTieDef& def,
       const TensorObjectConverterBuilder& converter_builder) {
     auto object_type = def.external_def.object_def.object_type;
+#ifdef CL_DELEGATE_ALLOW_GL
     if (def.external_def.object_def.user_provided &&
         GlClBufferCopier::IsSupported(def.external_def.object_def,
                                       def.internal_def.object_def)) {
       return true;
     }
+#endif
     return (object_type == ObjectType::OPENCL_BUFFER ||
             object_type == ObjectType::OPENCL_TEXTURE ||
             object_type == ObjectType::CPU_MEMORY) &&
@@ -138,6 +147,7 @@ class DefaultTensorTie : public TensorTie {
  private:
   absl::Status Init(TensorObjectConverterBuilder* converter_builder,
                     Environment* env) {
+#ifdef CL_DELEGATE_ALLOW_GL
     if (def().external_def.object_def.user_provided &&
         GlClBufferCopier::IsSupported(def().external_def.object_def,
                                       def().internal_def.object_def)) {
@@ -156,6 +166,12 @@ class DefaultTensorTie : public TensorTie {
       RETURN_IF_ERROR(converter_builder->MakeConverter(
           def().internal_def, def().external_def, &converter_to_));
     }
+#else
+    RETURN_IF_ERROR(converter_builder->MakeConverter(
+        def().external_def, def().internal_def, &converter_from_));
+    RETURN_IF_ERROR(converter_builder->MakeConverter(
+        def().internal_def, def().external_def, &converter_to_));
+#endif
     return MaybeAllocateExternalObject(env);
   }
 
@@ -275,6 +291,7 @@ class TwoStepTensorTie : public TensorTie {
   std::unique_ptr<TensorTie> outer_tie_;
 };
 
+#ifdef CL_DELEGATE_ALLOW_GL
 // Captures GL object into CL context before performing a conversion.
 class GlBufferHolder : public TensorTie {
  public:
@@ -351,6 +368,7 @@ class GlBufferHolder : public TensorTie {
   std::unique_ptr<TensorTie> tie_;
   TensorObject external_obj_;
 };
+#endif
 
 TensorObject TensorToObj(const Tensor& tensor) {
   if (tensor.GetStorageType() == TensorStorageType::BUFFER) {
@@ -365,19 +383,28 @@ TensorObject TensorToObj(const Tensor& tensor) {
 // Responsible for creating new tensor objects.
 class TensorTieFactory {
  public:
-  TensorTieFactory(Environment* env, InferenceContext* context,
-                   GlInteropFabric* gl_interop_fabric)
+  TensorTieFactory(Environment* env, InferenceContext* context
+#ifdef CL_DELEGATE_ALLOW_GL
+                   ,
+                   GlInteropFabric* gl_interop_fabric
+#endif
+                   )
       : env_(*env),
         context_(*context),
+#ifdef CL_DELEGATE_ALLOW_GL
         gl_interop_fabric_(gl_interop_fabric),
-        converter_builder_(NewConverterBuilder(env)) {}
+#endif
+        converter_builder_(NewConverterBuilder(env)) {
+  }
 
   bool IsSupported(const TensorTieDef& def) const {
     return IsValid(def.external_def.object_def) &&
            (NoopTensorTie::IsSupported(def) ||
            DefaultTensorTie::IsSupported(def, *converter_builder_) ||
+#ifdef CL_DELEGATE_ALLOW_GL
            (gl_interop_fabric_ &&
             GlBufferHolder::IsSupported(def, *converter_builder_)) ||
+#endif
            TwoStepTensorTie::IsSupported(def, *converter_builder_));
   }
 
@@ -392,10 +419,12 @@ class TensorTieFactory {
     if (DefaultTensorTie::IsSupported(def, *converter)) {
       return DefaultTensorTie::New(def, internal_object, converter, &env_, tie);
     }
+#ifdef CL_DELEGATE_ALLOW_GL
     if (gl_interop_fabric_ && GlBufferHolder::IsSupported(def, *converter)) {
       return GlBufferHolder::New(def, internal_object, converter,
                                  gl_interop_fabric_, &env_, tie);
     }
+#endif
     if (TwoStepTensorTie::IsSupported(def, *converter)) {
       return TwoStepTensorTie::New(def, internal_object, converter, &env_, tie);
     }
@@ -405,18 +434,29 @@ class TensorTieFactory {
  private:
   Environment& env_;
   InferenceContext& context_;
+#ifdef CL_DELEGATE_ALLOW_GL
   GlInteropFabric* gl_interop_fabric_;
+#endif
   std::unique_ptr<TensorObjectConverterBuilder> converter_builder_;
 };
 
 class InferenceRunnerImpl : public InferenceRunner {
  public:
   InferenceRunnerImpl(Environment* environment,
-                      std::unique_ptr<InferenceContext> context,
-                      std::unique_ptr<GlInteropFabric> gl_interop_fabric)
+                      std::unique_ptr<InferenceContext> context
+#ifdef CL_DELEGATE_ALLOW_GL
+                      ,
+                      std::unique_ptr<GlInteropFabric> gl_interop_fabric
+#endif
+                      )
       : queue_(environment->queue()),
-        context_(std::move(context)),
-        gl_interop_fabric_(std::move(gl_interop_fabric)) {}
+        context_(std::move(context))
+#ifdef CL_DELEGATE_ALLOW_GL
+        ,
+        gl_interop_fabric_(std::move(gl_interop_fabric))
+#endif
+  {
+  }
 
   absl::Status Initialize(const std::vector<TensorTieDef>& inputs,
                           const std::vector<TensorTieDef>& outputs,
@@ -464,9 +504,11 @@ class InferenceRunnerImpl : public InferenceRunner {
   }
 
   absl::Status Run() override {
+#ifdef CL_DELEGATE_ALLOW_GL
     if (gl_interop_fabric_) {
       RETURN_IF_ERROR(gl_interop_fabric_->Start());
     }
+#endif
     for (auto& obj : inputs_) {
       RETURN_IF_ERROR(obj->CopyFromExternalObject());
     }
@@ -475,9 +517,11 @@ class InferenceRunnerImpl : public InferenceRunner {
     for (auto& obj : outputs_) {
       RETURN_IF_ERROR(obj->CopyToExternalObject());
     }
+#ifdef CL_DELEGATE_ALLOW_GL
     if (gl_interop_fabric_) {
       RETURN_IF_ERROR(gl_interop_fabric_->Finish());
     }
+#endif
     return absl::OkStatus();
   }
 
@@ -506,7 +550,9 @@ class InferenceRunnerImpl : public InferenceRunner {
 
   CLCommandQueue* queue_;
   std::unique_ptr<InferenceContext> context_;
+#ifdef CL_DELEGATE_ALLOW_GL
   std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
+#endif
   std::vector<std::unique_ptr<TensorTie>> inputs_;
   std::vector<std::unique_ptr<TensorTie>> outputs_;
 };
@@ -542,6 +588,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
     }
     RETURN_IF_ERROR(context_->InitFromGraph(create_info, graph, environment_));
 
+#ifdef CL_DELEGATE_ALLOW_GL
     if (env_options.IsGlAware() &&
         IsGlSharingSupported(environment_->device())) {
       gl_interop_fabric_ = absl::make_unique<GlInteropFabric>(
@@ -549,6 +596,10 @@ class InferenceBuilderImpl : public InferenceBuilder {
     }
     tie_factory_ = absl::make_unique<TensorTieFactory>(
         environment_, context_.get(), gl_interop_fabric_.get());
+#else
+    tie_factory_ =
+        absl::make_unique<TensorTieFactory>(environment_, context_.get());
+#endif
 
     inputs_ = LinkTensors(graph, graph.inputs());
     outputs_ = LinkTensors(graph, graph.outputs());
@@ -599,6 +650,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
   }
 
   absl::Status Build(std::unique_ptr<InferenceRunner>* runner) override {
+#ifdef CL_DELEGATE_ALLOW_GL
     if (gl_interop_fabric_ && !HasGlObjects()) {
       // destroy interop layer when there are no GL objects to avoid
       // extra synchronization cost.
@@ -606,6 +658,10 @@ class InferenceBuilderImpl : public InferenceBuilder {
     }
     auto runner_impl = absl::make_unique<InferenceRunnerImpl>(
         environment_, std::move(context_), std::move(gl_interop_fabric_));
+#else
+    auto runner_impl = absl::make_unique<InferenceRunnerImpl>(
+        environment_, std::move(context_));
+#endif
     RETURN_IF_ERROR(
         runner_impl->Initialize(inputs_, outputs_, tie_factory_.get()));
     *runner = std::move(runner_impl);
@@ -676,6 +732,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
   }
 
   bool HasGlObjects() const {
+#ifdef CL_DELEGATE_ALLOW_GL
     auto is_gl = [](ObjectType t) {
       return t == ObjectType::OPENGL_SSBO || t == ObjectType::OPENGL_TEXTURE;
     };
@@ -689,6 +746,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
         return true;
       }
     }
+#endif
     return false;
   }
 
@@ -703,7 +761,9 @@ class InferenceBuilderImpl : public InferenceBuilder {
   }
 
   std::unique_ptr<InferenceContext> context_;
+#ifdef CL_DELEGATE_ALLOW_GL
   std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
+#endif
   Environment* environment_;
 
   std::vector<TensorTieDef> inputs_;
@@ -730,20 +790,25 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
       RETURN_IF_ERROR(CreateDefaultGPUDevice(&device));
     }
 
+#ifdef CL_DELEGATE_ALLOW_GL
     properties_.is_gl_sharing_supported = IsGlSharingSupported(device);
     properties_.is_gl_to_cl_fast_sync_supported =
         IsClEventFromEglSyncSupported(device);
     properties_.is_cl_to_gl_fast_sync_supported =
        IsEglSyncFromClEventSupported();
+#endif
 
     CLContext context;
     if (options_.context) {
+#ifdef CL_DELEGATE_ALLOW_GL
       if (options_.IsGlAware()) {
         return absl::InvalidArgumentError(
             "OpenCL context and EGL parameters are set in the same time.");
       }
+#endif
       context = CLContext(options_.context, /* has_ownership = */ false);
     } else {
+#ifdef CL_DELEGATE_ALLOW_GL
       if (options_.IsGlAware() && properties_.is_gl_sharing_supported) {
         RETURN_IF_ERROR(CreateCLGLContext(
             device,
@@ -753,6 +818,9 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
       } else {
         RETURN_IF_ERROR(CreateCLContext(device, &context));
       }
+#else
+      RETURN_IF_ERROR(CreateCLContext(device, &context));
+#endif
     }
 
     CLCommandQueue queue;
@@ -16,6 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
 
+#ifdef CL_DELEGATE_NO_GL
+#define EGL_NO_PROTOTYPES
+#endif
+
 #include <EGL/egl.h>
 
 #include <cstdint>
@@ -16,8 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
 
+#define GL_NO_PROTOTYPES
+#define EGL_NO_PROTOTYPES
 #include <EGL/egl.h>
 #include <GLES3/gl31.h>
+#undef GL_NO_PROTOTYPES
+#undef EGL_NO_PROTOTYPES
+
 #include <stdint.h>
 
 #include "tensorflow/lite/c/common.h"
@@ -76,8 +81,8 @@ typedef struct {
 //   .compile_options = {
 //     .precision_loss_allowed = false,
 //   }
-//   .egl_display = eglGetCurrentDisplay(),
-//   .egl_context = eglGetCurrentContext();
+//   .egl_display = EGL_NO_DISPLAY;
+//   .egl_context = EGL_NO_CONTEXT;
 TFL_CAPI_EXPORT TfLiteDelegate* TfLiteGpuDelegateCreate_New(
     const TfLiteGpuDelegateOptions_New* options);
 
@@ -16,3 +16,21 @@ cc_binary(
         "@com_google_absl//absl/time",
     ],
 )
+
+cc_binary(
+    name = "delegate_testing",
+    srcs = ["delegate_testing.cc"],
+    tags = [
+        "nobuilder",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/lite/delegates/gpu:delegate",
+        "//tensorflow/lite/delegates/gpu/cl:gpu_api_delegate",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/common/testing:tflite_model_reader",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
+        "@com_google_absl//absl/time",
+    ],
+)
tensorflow/lite/delegates/gpu/cl/testing/delegate_testing.cc (new file, 158 lines)
@@ -0,0 +1,158 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <chrono>  // NOLINT(build/c++11)
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "absl/time/time.h"
+#include "tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.h"
+#include "tensorflow/lite/delegates/gpu/delegate.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+
+namespace {
+
+void FillInputTensor(tflite::Interpreter* interpreter) {
+  for (int k = 0; k < interpreter->inputs().size(); ++k) {
+    float* p = interpreter->typed_input_tensor<float>(k);
+    const auto n =
+        tflite::NumElements(interpreter->tensor(interpreter->inputs()[k]));
+    for (int i = 0; i < n; ++i) {
+      p[i] = std::sin(i);
+    }
+  }
+}
+
+void CompareCPUGPUResults(tflite::Interpreter* cpu, tflite::Interpreter* gpu,
+                          float eps) {
+  for (int i = 0; i < cpu->outputs().size(); ++i) {
+    const float* cpu_out = cpu->typed_output_tensor<float>(i);
+    const float* gpu_out = gpu->typed_output_tensor<float>(i);
+    auto out_n = tflite::NumElements(cpu->tensor(cpu->outputs()[i]));
+    const int kMaxPrint = 10;
+    int printed = 0;
+    int total_different = 0;
+    for (int k = 0; k < out_n; ++k) {
+      const float abs_diff = fabs(cpu_out[k] - gpu_out[k]);
+      if (abs_diff > eps) {
+        total_different++;
+        if (printed < kMaxPrint) {
+          std::cout << "Output #" << i << ": element #" << k << ": CPU value - "
+                    << cpu_out[k] << ", GPU value - " << gpu_out[k]
+                    << ", abs diff - " << abs_diff << std::endl;
+          printed++;
+        }
+        if (printed == kMaxPrint) {
+          std::cout << "Printed " << kMaxPrint
+                    << " different elements, threshhold - " << eps
+                    << ", next different elements skipped" << std::endl;
+          printed++;
+        }
+      }
+    }
+    std::cout << "Total " << total_different
+              << " different elements, for output #" << i << ", threshhold - "
+              << eps << std::endl;
+  }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  if (argc <= 1) {
+    std::cerr << "Expected model path as second argument." << std::endl;
+    return -1;
+  }
+
+  auto model = tflite::FlatBufferModel::BuildFromFile(argv[1]);
+  if (!model) {
+    std::cerr << "FlatBufferModel::BuildFromFile failed, model path - "
+              << argv[1] << std::endl;
+    return -1;
+  }
+  tflite::ops::builtin::BuiltinOpResolver op_resolver;
+  tflite::InterpreterBuilder builder(*model, op_resolver);
+
+  // CPU.
+  std::unique_ptr<tflite::Interpreter> cpu_inference;
+  builder(&cpu_inference);
+  if (!cpu_inference) {
+    std::cerr << "Failed to build CPU inference." << std::endl;
+    return -1;
+  }
+  auto status = cpu_inference->AllocateTensors();
+  if (status != kTfLiteOk) {
+    std::cerr << "Failed to AllocateTensors for CPU inference." << std::endl;
+    return -1;
+  }
+  FillInputTensor(cpu_inference.get());
+  status = cpu_inference->Invoke();
+  if (status != kTfLiteOk) {
+    std::cerr << "Failed to Invoke CPU inference." << std::endl;
+    return -1;
+  }
+
+  // GPU.
+  std::unique_ptr<tflite::Interpreter> gpu_inference;
+  builder(&gpu_inference);
+  if (!gpu_inference) {
+    std::cerr << "Failed to build GPU inference." << std::endl;
+    return -1;
+  }
+  TfLiteGpuDelegateOptionsV2 options;
+  options.is_precision_loss_allowed = -1;
+  options.inference_preference =
+      TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
+  options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
+  options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE;
+  options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
+  auto* gpu_delegate = TfLiteGpuDelegateV2Create(&options);
+  status = gpu_inference->ModifyGraphWithDelegate(gpu_delegate);
+  if (status != kTfLiteOk) {
+    std::cerr << "ModifyGraphWithDelegate failed." << std::endl;
+    return -1;
+  }
+  FillInputTensor(gpu_inference.get());
+  status = gpu_inference->Invoke();
+  if (status != kTfLiteOk) {
+    std::cerr << "Failed to Invoke GPU inference." << std::endl;
+    return -1;
+  }
+
+  CompareCPUGPUResults(cpu_inference.get(), gpu_inference.get(), 1e-4f);
+
+  // CPU inference latency.
+  auto start = std::chrono::high_resolution_clock::now();
+  cpu_inference->Invoke();
+  auto end = std::chrono::high_resolution_clock::now();
+  std::cout << "CPU time - " << (end - start).count() * 1e-6f << "ms"
+            << std::endl;
+
+  // GPU inference latency.
+  start = std::chrono::high_resolution_clock::now();
+  gpu_inference->Invoke();
+  end = std::chrono::high_resolution_clock::now();
+  std::cout << "GPU time(CPU->GPU->CPU) - " << (end - start).count() * 1e-6f
+            << "ms" << std::endl;
+
+  TfLiteGpuDelegateV2Delete(gpu_delegate);
+  return EXIT_SUCCESS;
+}
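For reference, a hypothetical invocation of the new testing binary might look like the following (the model path is a placeholder, not part of the commit):

    bazel run //tensorflow/lite/delegates/gpu/cl/testing:delegate_testing -- /path/to/model.tflite

The binary runs the model once on the CPU and once through the GPU delegate, compares the outputs element-wise against a 1e-4 threshold, and reports the latency of each path.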
@@ -34,10 +34,13 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
 #include "tensorflow/lite/delegates/gpu/common/quantization_util.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
-#include "tensorflow/lite/delegates/gpu/gl/api2.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/minimal_logging.h"
 
+#ifndef CL_DELEGATE_NO_GL
+#include "tensorflow/lite/delegates/gpu/gl/api2.h"
+#endif
+
 namespace tflite {
 namespace gpu {
 namespace {
@@ -315,6 +318,7 @@ class DelegateKernel {
 
   absl::Status InitializeOpenGlApi(GraphFloat32* graph,
                                    std::unique_ptr<InferenceBuilder>* builder) {
+#ifndef CL_DELEGATE_NO_GL
     gl::InferenceEnvironmentOptions env_options;
     gl::InferenceEnvironmentProperties properties;
     RETURN_IF_ERROR(
@@ -330,13 +334,16 @@ class DelegateKernel {
     enforce_same_thread_ = true;
     TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                          "Initialized OpenGL-based API.");
+#endif
     return absl::OkStatus();
   }
 
   // The Delegate instance that's shared across all DelegateKernel instances.
   Delegate* const delegate_;  // doesn't own the memory.
   std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
+#ifndef CL_DELEGATE_NO_GL
   std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
+#endif
   std::unique_ptr<InferenceRunner> runner_;
   std::vector<int64_t> input_indices_;
   std::vector<int64_t> output_indices_;