Implemented: GlPersistentBuffer, GlShaderSync

PiperOrigin-RevId: 249544610
2019-05-22 16:27:51 -07:00 · 2019-05-22 16:27:51 -07:00 · 87ae6fb8ee
commit 87ae6fb8ee
parent c582d56010
5 changed files with 138 additions and 2 deletions
--- a/tensorflow/lite/delegates/gpu/gl/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/BUILD
@ -260,8 +260,10 @@ cc_library(
    srcs = ["gl_sync.cc"],
    hdrs = ["gl_sync.h"],
    deps = [
+        ":gl_buffer",
        ":gl_call",
        ":gl_errors",
+        ":gl_program",
        ":portable",
        "//tensorflow/lite/delegates/gpu/common:status",
    ],
--- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc
+++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc
@ -84,6 +84,54 @@ GlBuffer GlBuffer::MakeRef() {
                  /* has_ownership = */ false);
 }

+// Wraps the GL buffer described by the first five arguments (forwarded to
+// GlBuffer unchanged) and remembers `data` as the CPU pointer of that buffer.
+GlPersistentBuffer::GlPersistentBuffer(GLenum target, GLuint id,
+                                       size_t bytes_size, size_t offset,
+                                       bool has_ownership, void* data)
+    : GlBuffer(target, id, bytes_size, offset, has_ownership),
+      data_(data) {}
+
+// Builds an empty wrapper: GL_INVALID_ENUM target, GL_INVALID_INDEX id, zero
+// size and offset, no ownership and a null CPU pointer.
+GlPersistentBuffer::GlPersistentBuffer()
+    : GlPersistentBuffer(/* target = */ GL_INVALID_ENUM,
+                         /* id = */ GL_INVALID_INDEX,
+                         /* bytes_size = */ 0,
+                         /* offset = */ 0,
+                         /* has_ownership = */ false,
+                         /* data = */ nullptr) {}
+
+// Takes over the GL buffer state and the CPU pointer of `buffer`.
+// The pointer held by `buffer` is reset to null so that destroying the
+// moved-from object does not call glUnmapBuffer on the mapping that now
+// belongs to this object.
+GlPersistentBuffer::GlPersistentBuffer(GlPersistentBuffer&& buffer)
+    : GlBuffer(std::move(buffer)), data_(buffer.data_) {
+  // Only the GlBuffer base sub-object was moved above; the derived `data_`
+  // member of `buffer` is still intact and can be read and cleared here.
+  buffer.data_ = nullptr;
+}
+
+// Move-assigns the GL buffer state and the CPU pointer from `buffer`.
+// Self-assignment is a no-op. The pointer held by `buffer` is reset to null so
+// that destroying the moved-from object does not call glUnmapBuffer on the
+// mapping that now belongs to this object.
+// NOTE(review): a mapping previously held by *this is not unmapped explicitly
+// here; presumably GlBuffer::operator= releases the old buffer - confirm.
+GlPersistentBuffer& GlPersistentBuffer::operator=(GlPersistentBuffer&& buffer) {
+  if (this != &buffer) {
+    data_ = buffer.data_;
+    buffer.data_ = nullptr;
+    GlBuffer::operator=(std::move(buffer));
+  }
+  return *this;
+}
+
+// Unmaps the buffer when a CPU pointer is held; otherwise does nothing.
+GlPersistentBuffer::~GlPersistentBuffer() {
+  if (data_ != nullptr) {
+    // glUnmapBuffer acts on whatever is bound to the target, so this buffer is
+    // bound to GL_SHADER_STORAGE_BUFFER first.
+    gl_buffer_internal::BufferBinder bind_scope(GL_SHADER_STORAGE_BUFFER,
+                                                id());
+    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+  }
+}
+
+// Creates a new shader storage buffer of `size` bytes via glBufferStorageEXT,
+// maps it for CPU reads and writes with persistent and coherent access, and
+// stores the resulting owning wrapper in `gl_buffer`.
+//
+// Returns UnavailableError if glBufferStorageEXT cannot be resolved through
+// eglGetProcAddress; errors reported by the GL calls are propagated.
+Status CreatePersistentBuffer(size_t size, GlPersistentBuffer* gl_buffer) {
+  PFNGLBUFFERSTORAGEEXTPROC glBufferStorageEXT =
+      reinterpret_cast<PFNGLBUFFERSTORAGEEXTPROC>(
+          eglGetProcAddress("glBufferStorageEXT"));
+  if (!glBufferStorageEXT) {
+    return UnavailableError("glBufferStorageEXT is not supported");
+  }
+  gl_buffer_internal::BufferId id;
+  gl_buffer_internal::BufferBinder binder(GL_SHADER_STORAGE_BUFFER, id.id());
+  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(
+      glBufferStorageEXT, GL_SHADER_STORAGE_BUFFER, size, nullptr,
+      GL_MAP_COHERENT_BIT_EXT | GL_MAP_READ_BIT | GL_MAP_WRITE_BIT |
+          GL_MAP_PERSISTENT_BIT_EXT));
+  void* data = nullptr;
+  // The coherent bit has to be requested on the mapping as well as on the
+  // storage: the CPU polls this memory for GPU writes without issuing
+  // barriers, which is only meaningful for a coherent mapping.
+  RETURN_IF_ERROR(TFLITE_GPU_CALL_GL(
+      glMapBufferRange, &data, GL_SHADER_STORAGE_BUFFER, 0, size,
+      GL_MAP_COHERENT_BIT_EXT | GL_MAP_READ_BIT | GL_MAP_WRITE_BIT |
+          GL_MAP_PERSISTENT_BIT_EXT));
+  // The id is released from the local guard only once every GL call has
+  // succeeded; the returned wrapper takes ownership of it.
+  *gl_buffer = GlPersistentBuffer{
+      GL_SHADER_STORAGE_BUFFER, id.Release(), size, 0, true, data};
+  return OkStatus();
+}
+
 }  // namespace gl
 }  // namespace gpu
 }  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.h
+++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.h
@ -141,6 +141,32 @@ Status AppendFromBuffer(const GlBuffer& buffer, std::vector<T>* data) {
      absl::MakeSpan(data->data() + data->size() - num_elements, num_elements));
 }

+// A GlBuffer that also exposes a CPU pointer to the buffer contents which
+// stays valid all the time. Synchronizing CPU-side and GPU-side access to the
+// buffer is the responsibility of the user.
+class GlPersistentBuffer : public GlBuffer {
+ public:
+  // Creates an empty wrapper with a null CPU pointer.
+  GlPersistentBuffer();
+  // Wraps the given GL buffer; `data` is the CPU pointer returned by data().
+  GlPersistentBuffer(GLenum target, GLuint id, size_t bytes_size, size_t offset,
+                     bool has_ownership, void* data);
+
+  // Copying is disabled; the type is move-only.
+  GlPersistentBuffer(const GlPersistentBuffer&) = delete;
+  GlPersistentBuffer& operator=(const GlPersistentBuffer&) = delete;
+  GlPersistentBuffer(GlPersistentBuffer&& buffer);
+  GlPersistentBuffer& operator=(GlPersistentBuffer&& buffer);
+
+  // Unmaps the buffer if a CPU pointer is held.
+  ~GlPersistentBuffer();
+
+  // CPU pointer to the buffer contents, or null for an empty wrapper.
+  void* data() { return data_; }
+
+ private:
+  void* data_;
+};
+
+// Creates a `size`-byte read-write persistent buffer with a valid CPU pointer.
+Status CreatePersistentBuffer(size_t size, GlPersistentBuffer* gl_buffer);
+
 ////////////////////////////////////////////////////////////////////////////////
 // Implementation details are below.

--- a/tensorflow/lite/delegates/gpu/gl/gl_sync.cc
+++ b/tensorflow/lite/delegates/gpu/gl/gl_sync.cc
@ -78,6 +78,46 @@ Status GlActiveSyncWait() {
  }
 }

+// Initializes `gl_sync` with an int-sized persistent flag buffer and a compute
+// program whose single invocation writes 1 into the first element of the
+// buffer bound at binding 0. `gl_sync` is assigned only after every step has
+// succeeded; any failing step returns its error immediately.
+Status GlShaderSync::NewSync(GlShaderSync* gl_sync) {
+  // Function-local static that is allocated once and intentionally never
+  // freed.
+  static const std::string* kCode = new std::string(R"(#version 310 es
+  layout(local_size_x = 1, local_size_y = 1) in;
+  layout(std430) buffer;
+  layout(binding = 0) buffer Output {
+    int elements[];
+  } output_data;
+  void main() {
+    output_data.elements[0] = 1;
+  })");
+
+  GlShaderSync result;
+  RETURN_IF_ERROR(CreatePersistentBuffer(sizeof(int), &result.flag_buffer_));
+
+  GlShader flag_shader;
+  RETURN_IF_ERROR(
+      GlShader::CompileShader(GL_COMPUTE_SHADER, *kCode, &flag_shader));
+  RETURN_IF_ERROR(
+      GlProgram::CreateWithShader(flag_shader, &result.flag_program_));
+
+  *gl_sync = std::move(result);
+  return OkStatus();
+}
+
+// How it works: the GPU writes into a buffer and the CPU polls that buffer
+// until the value changes. The buffer has to be writable by the GPU and
+// readable by the CPU at the same time - a persistent buffer or a buffer
+// shared across a child context can be used for that.
+//
+// Returns UnavailableError if the flag buffer is not valid, and propagates
+// errors from binding the buffer or dispatching the program. Otherwise blocks
+// in a busy loop until the flag reads 1.
+Status GlShaderSync::Wait() {
+  if (!flag_buffer_.is_valid()) {
+    return UnavailableError("GlShaderSync is not initialized.");
+  }
+  RETURN_IF_ERROR(flag_buffer_.BindToIndex(0));
+  // volatile makes every poll below an actual read of the mapped memory.
+  volatile int* const flag = static_cast<int*>(flag_buffer_.data());
+  // Reset the flag before asking the GPU to set it.
+  *flag = 0;
+  RETURN_IF_ERROR(flag_program_.Dispatch({1, 1, 1}));
+  // glFlush must be called to upload the GPU task. Adreno won't start
+  // executing the task without glFlush.
+  glFlush();
+  // Spin until the shader has updated the value.
+  for (;;) {
+    if (*flag == 1) break;
+  }
+  return OkStatus();
+}
+
 }  // namespace gl
 }  // namespace gpu
 }  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/gl/gl_sync.h
+++ b/tensorflow/lite/delegates/gpu/gl/gl_sync.h
@ -17,7 +17,9 @@ limitations under the License.
 #define TENSORFLOW_LITE_DELEGATES_GPU_GL_GL_SYNC_H_

 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
 #include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
+#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
 #include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

 namespace tflite {
@ -75,10 +77,28 @@ class GlSync {
 // Waits until GPU is done with processing.
 Status GlSyncWait();

-// Performs active waiting by spinning a thread and checking sync status. It
-// leads to shorter wait time (up to tens of ms) but consumes more CPU.
+// Waits until all commands are flushed and then performs active waiting by
+// spinning a thread and checking sync status. It leads to shorter wait time
+// (up to tens of ms) but consumes more CPU.
 Status GlActiveSyncWait();

+// Minimum-latency finish based on active busy waiting: the calling thread does
+// not go to sleep. Wait() dispatches a small compute program that writes into
+// a persistent buffer and spins on the CPU until the written value shows up.
+// NOTE(review): the original description also listed polling a glSync object
+// and falling back to glFinish() when other methods are unavailable; those
+// paths are not part of the Wait() implementation added in this change -
+// confirm whether they are planned.
+class GlShaderSync {
+ public:
+  // Creates an uninitialized object; Wait() on it returns an error.
+  GlShaderSync() {}
+
+  // Creates the flag buffer and flag program and stores the result in
+  // `gl_sync`.
+  static Status NewSync(GlShaderSync* gl_sync);
+
+  // Blocks the calling thread until the GPU has written the flag.
+  Status Wait();
+
+ private:
+  GlProgram flag_program_;
+  GlPersistentBuffer flag_buffer_;
+};
+
 }  // namespace gl
 }  // namespace gpu
 }  // namespace tflite