diff --git a/tensorflow/lite/experimental/ruy/CONTRIBUTING.md b/tensorflow/lite/experimental/ruy/CONTRIBUTING.md
deleted file mode 100644
index 654a071648d..00000000000
--- a/tensorflow/lite/experimental/ruy/CONTRIBUTING.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# How to Contribute
-
-We'd love to accept your patches and contributions to this project. There are
-just a few small guidelines you need to follow.
-
-## Contributor License Agreement
-
-Contributions to this project must be accompanied by a Contributor License
-Agreement. You (or your employer) retain the copyright to your contribution;
-this simply gives us permission to use and redistribute your contributions as
-part of the project. Head over to <https://cla.developers.google.com/> to see
-your current agreements on file or to sign a new one.
-
-You generally only need to submit a CLA once, so if you've already submitted one
-(even if it was for a different project), you probably don't need to do it
-again.
-
-## Code reviews
-
-All submissions, including submissions by project members, require review. We
-use GitHub pull requests for this purpose. Consult
-[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
-information on using pull requests.
-
-## Community Guidelines
-
-This project follows [Google's Open Source Community
-Guidelines](https://opensource.google/conduct/).
diff --git a/tensorflow/lite/experimental/ruy/README.md b/tensorflow/lite/experimental/ruy/README.md
deleted file mode 100644
index 09b85927d09..00000000000
--- a/tensorflow/lite/experimental/ruy/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# The ruy matrix multiplication library
-
-This is not an officially supported Google product.
-
-ruy is a matrix multiplication library. Its focus is to cover the matrix
-multiplication needs of neural network inference engines. Its initial user has
-been TensorFlow Lite, where it is used by default on the ARM CPU architecture.
-
-ruy supports both floating-point and 8bit-integer-quantized matrices.
-
-## Efficiency
-
-ruy is designed to achieve maximal performance not just on very large sizes, as
-is the focus of many established libraries, but on whatever are the actual sizes
-and shapes of matrices most critical in current TensorFlow Lite applications.
-This often means quite small sizes, e.g. 100x100 or even 50x50, and all sorts of
-rectangular shapes.
-
-ruy is currently only optimized for the ARM architectures (both 64-bit and
-32-bit code). Optimization for the Intel x86 architecture is in progress.
-
-ruy is currently optimized only for the following combination of storage orders:
-LHS = row-major, RHS = column-major, destination = column-major. All other
-combinations of storage orders fall back to slow reference code at the moment.
diff --git a/tensorflow/lite/experimental/ruy/WORKSPACE b/tensorflow/lite/experimental/ruy/WORKSPACE
deleted file mode 100644
index 8364d8047b1..00000000000
--- a/tensorflow/lite/experimental/ruy/WORKSPACE
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Workspace file for the Ruy project.
-
-workspace(name = "com_google_ruy")
diff --git a/tensorflow/lite/experimental/ruy/ruy/BUILD b/tensorflow/lite/experimental/ruy/ruy/BUILD
deleted file mode 100644
index c808c3ec063..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/BUILD
+++ /dev/null
@@ -1,954 +0,0 @@
-# Ruy is not BLAS
-
-load(":build_defs.bzl", "ruy_copts_avx2", "ruy_copts_avxvnni", "ruy_copts_base", "ruy_copts_skylake", "ruy_copts_sse42")
-load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
-load(":ruy_test.bzl", "ruy_benchmark", "ruy_test")
-
-package(
- default_visibility = ["//visibility:public"],
- licenses = ["notice"], # Apache 2.0
-)
-
-config_setting(
- name = "windows",
- values = {"cpu": "x64_windows"},
-)
-
-config_setting(
- name = "armeabi-v7a",
- values = {"cpu": "armeabi-v7a"},
-)
-
-config_setting(
- name = "x86_64",
- values = {"cpu": "k8"},
-)
-
-config_setting(
- name = "optimized",
- values = {
- "compilation_mode": "opt",
- },
- visibility = ["//visibility:public"],
-)
-
-cc_library(
- name = "platform",
- hdrs = ["platform.h"],
- copts = ruy_copts_base(),
-)
-
-cc_library(
- name = "check_macros",
- hdrs = ["check_macros.h"],
- copts = ruy_copts_base(),
-)
-
-cc_test(
- name = "check_macros_test",
- srcs = ["check_macros_test.cc"],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "opt_set",
- hdrs = ["opt_set.h"],
- copts = ruy_copts_base(),
-)
-
-cc_library(
- name = "time",
- hdrs = ["time.h"],
- copts = ruy_copts_base(),
-)
-
-cc_library(
- name = "wait",
- srcs = ["wait.cc"],
- hdrs = ["wait.h"],
- copts = ruy_copts_base(),
- deps = [":time"],
-)
-
-cc_test(
- name = "wait_test",
- srcs = ["wait_test.cc"],
- deps = [
- ":platform",
- ":wait",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "size_util",
- hdrs = ["size_util.h"],
- copts = ruy_copts_base(),
- deps = [":check_macros"],
-)
-
-cc_test(
- name = "size_util_test",
- srcs = ["size_util_test.cc"],
- deps = [
- ":size_util",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "tune",
- srcs = [
- "tune.cc",
- ],
- hdrs = [
- "tune.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":opt_set",
- ":platform",
- ":time",
- ],
-)
-
-cc_library(
- name = "prepacked_cache",
- srcs = [
- "prepacked_cache.cc",
- ],
- hdrs = [
- "prepacked_cache.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":allocator",
- ":matrix",
- ":opt_set",
- ":platform",
- ":time",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_test(
- name = "tune_test",
- srcs = ["tune_test.cc"],
- deps = [
- ":tune",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_test(
- name = "prepacked_cache_test",
- srcs = ["prepacked_cache_test.cc"],
- deps = [
- ":prepacked_cache",
- ":ruy",
- ":time",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_binary(
- name = "tune_tool",
- srcs = ["tune_tool.cc"],
- deps = [
- ":tune",
- ],
-)
-
-cc_library(
- name = "allocator",
- srcs = [
- "allocator.cc",
- ],
- hdrs = [
- "allocator.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":size_util",
- ],
-)
-
-cc_test(
- name = "allocator_test",
- srcs = ["allocator_test.cc"],
- deps = [
- ":allocator",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "side_pair",
- hdrs = ["side_pair.h"],
- copts = ruy_copts_base(),
- deps = [":check_macros"],
-)
-
-cc_library(
- name = "block_map",
- srcs = [
- "block_map.cc",
- ],
- hdrs = [
- "block_map.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":opt_set",
- ":path",
- ":side_pair",
- ":size_util",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_test(
- name = "block_map_test",
- srcs = ["block_map_test.cc"],
- deps = [
- ":block_map",
- ":cpu_cache_size",
- ":path",
- ":side_pair",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "blocking_counter",
- srcs = [
- "blocking_counter.cc",
- ],
- hdrs = [
- "blocking_counter.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":wait",
- ],
-)
-
-cc_library(
- name = "thread_pool",
- srcs = [
- "thread_pool.cc",
- ],
- hdrs = [
- "thread_pool.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":blocking_counter",
- ":check_macros",
- ":wait",
- ],
-)
-
-cc_library(
- name = "detect_arm",
- srcs = [
- "detect_arm.cc",
- ],
- hdrs = [
- "detect_arm.h",
- ],
- copts = ruy_copts_base(),
-)
-
-cc_library(
- name = "detect_x86",
- srcs = [
- "detect_x86.cc",
- ],
- hdrs = [
- "detect_x86.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":platform",
- ],
-)
-
-cc_library(
- name = "path",
- hdrs = ["path.h"],
- copts = ruy_copts_base(),
- deps = [
- ":platform",
- ":size_util",
- ],
-)
-
-cc_library(
- name = "cpu_cache_size",
- hdrs = ["cpu_cache_size.h"],
- copts = ruy_copts_base(),
- deps = [
- ":path",
- ":platform",
- ],
-)
-
-cc_library(
- name = "trace",
- srcs = [
- "trace.cc",
- ],
- hdrs = [
- "trace.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":block_map",
- ":check_macros",
- ":side_pair",
- ":time",
- ],
-)
-
-cc_library(
- name = "matrix",
- hdrs = ["matrix.h"],
- copts = ruy_copts_base(),
- deps = [":check_macros"],
-)
-
-cc_library(
- name = "spec",
- hdrs = ["spec.h"],
- copts = ruy_copts_base(),
- deps = [
- ":cpu_cache_size",
- ":matrix",
- ],
-)
-
-cc_library(
- name = "internal_matrix",
- hdrs = ["internal_matrix.h"],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":matrix",
- ":size_util",
- ],
-)
-
-cc_library(
- name = "common",
- hdrs = [
- "common.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":path",
- ":platform",
- ],
-)
-
-cc_library(
- name = "kernel_common",
- hdrs = [
- "kernel.h",
- "kernel_arm.h",
- "kernel_common.h",
- "kernel_x86.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":internal_matrix",
- ":matrix",
- ":opt_set",
- ":path",
- ":platform",
- ":side_pair",
- ":size_util",
- ":spec",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_common",
- hdrs = [
- "pack.h",
- "pack_arm.h",
- "pack_common.h",
- "pack_x86.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":internal_matrix",
- ":matrix",
- ":opt_set",
- ":path",
- ":platform",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "kernel_arm",
- srcs = [
- "kernel_arm32.cc",
- "kernel_arm64.cc",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":common",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_arm",
- srcs = [
- "pack_arm.cc",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":common",
- ":opt_set",
- ":pack_common",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-# AVX-512 compilation units.
-#
-# These must use the same compiler options.
-RUY_COPTS_BUILT_FOR_AVX512 = ruy_copts_base() + ruy_copts_skylake()
-
-cc_library(
- name = "kernel_avx512",
- srcs = [
- "kernel_avx512.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX512,
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_avx512",
- srcs = [
- "pack_avx512.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX512,
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_avx512",
- srcs = [
- "have_built_path_for_avx512.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX512,
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-# End: AVX-512 compilation units.
-
-# AVX2 compilation units.
-#
-# These must use the same compiler options.
-RUY_COPTS_BUILT_FOR_AVX2 = ruy_copts_base() + ruy_copts_avx2()
-
-cc_library(
- name = "kernel_avx2",
- srcs = [
- "kernel_avx2.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX2,
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_avx2",
- srcs = [
- "pack_avx2.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX2,
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_avx2",
- srcs = [
- "have_built_path_for_avx2.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX2,
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-# End: AVX2 compilation units.
-
-# SSE42 compilation units.
-#
-# TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# These must use the same compiler options.
-RUY_COPTS_BUILT_FOR_SSE42 = ruy_copts_base() + ruy_copts_sse42()
-
-cc_library(
- name = "kernel_sse42",
- srcs = [
- "kernel_sse42.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_SSE42,
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_sse42",
- srcs = [
- "pack_sse42.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_SSE42,
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_sse42",
- srcs = [
- "have_built_path_for_sse42.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = RUY_COPTS_BUILT_FOR_SSE42,
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-# End: SSE42 compilation units.
-
-# AVX-VNNI compilation units.
-#
-# TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# These must use the same compiler options.
-RUY_COPTS_BUILT_FOR_AVX_VNNI = ruy_copts_base() + ruy_copts_avxvnni()
-
-cc_library(
- name = "kernel_avxvnni",
- srcs = [
- "kernel_avxvnni.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX_VNNI,
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_avxvnni",
- srcs = [
- "pack_avxvnni.cc",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX_VNNI,
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_avxvnni",
- srcs = [
- "have_built_path_for_avxvnni.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = RUY_COPTS_BUILT_FOR_AVX_VNNI,
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-# End: AVX-VNNI compilation units.
-
-cc_library(
- name = "kernel",
- hdrs = [
- "kernel.h",
- "kernel_common.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":internal_matrix",
- ":kernel_arm", # fixdeps: keep
- ":kernel_avx2", # fixdeps: keep
- ":kernel_avx512", # fixdeps: keep
- ":kernel_avxvnni", # fixdeps: keep
- ":kernel_common",
- ":kernel_sse42", # fixdeps: keep
- ":matrix",
- ":opt_set",
- ":path",
- ":platform",
- ":side_pair",
- ":size_util",
- ":spec",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack",
- hdrs = [
- "pack.h",
- "pack_common.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":internal_matrix",
- ":matrix",
- ":opt_set",
- ":pack_arm", # fixdeps: keep
- ":pack_avx2", # fixdeps: keep
- ":pack_avx512", # fixdeps: keep
- ":pack_avxvnni", # fixdeps: keep
- ":pack_common",
- ":pack_sse42", # fixdeps: keep
- ":path",
- ":platform",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for",
- hdrs = [
- "have_built_path_for.h",
- ],
- deps = [
- ":have_built_path_for_avx2",
- ":have_built_path_for_avx512",
- ":have_built_path_for_avxvnni",
- ":have_built_path_for_sse42",
- ":platform",
- ],
-)
-
-cc_library(
- name = "context",
- srcs = [
- "context.cc",
- ],
- hdrs = [
- "context.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":allocator",
- ":check_macros",
- ":detect_arm",
- ":detect_x86",
- ":have_built_path_for",
- ":path",
- ":platform",
- ":prepacked_cache",
- ":thread_pool",
- ":trace",
- ":tune",
- ],
-)
-
-cc_test(
- name = "context_test",
- srcs = ["context_test.cc"],
- deps = [
- ":context",
- ":path",
- ":platform",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_library(
- name = "trmul_params",
- hdrs = ["trmul_params.h"],
- copts = ruy_copts_base(),
- deps = [
- ":internal_matrix",
- ":side_pair",
- ":tune",
- ],
-)
-
-cc_library(
- name = "trmul",
- srcs = ["trmul.cc"],
- hdrs = ["trmul.h"],
- copts = ruy_copts_base(),
- deps = [
- ":allocator",
- ":block_map",
- ":check_macros",
- ":common",
- ":context",
- ":internal_matrix",
- ":matrix",
- ":opt_set",
- ":side_pair",
- ":size_util",
- ":spec",
- ":thread_pool",
- ":trace",
- ":trmul_params",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-# The main library.
-cc_library(
- name = "ruy",
- srcs = [
- "dispatch.h",
- "prepack.h",
- ],
- hdrs = [
- "ruy.h",
- "ruy_advanced.h",
- ],
- copts = ruy_copts_base(),
- deps = [
- ":check_macros",
- ":common",
- ":context",
- ":internal_matrix",
- ":kernel",
- ":matrix",
- ":opt_set",
- ":pack",
- ":path",
- ":prepacked_cache",
- ":side_pair",
- ":size_util",
- ":spec",
- ":trmul",
- ":trmul_params",
- ":tune",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-# Usage examples.
-cc_binary(
- name = "example",
- srcs = ["example.cc"],
- deps = [":ruy"],
-)
-
-# Usage examples of the advanced API.
-cc_binary(
- name = "example_advanced",
- srcs = ["example_advanced.cc"],
- deps = [":ruy"],
-)
-
-# Small library to query PMU counters, for benchmark only
-cc_library(
- name = "pmu",
- testonly = True,
- srcs = ["pmu.cc"],
- hdrs = ["pmu.h"],
- copts = ruy_copts_base(),
- deps = [":check_macros"],
-)
-
-# Testing framework.
-cc_library(
- name = "test_lib",
- testonly = True,
- hdrs = ["test.h"],
- copts = ruy_copts_base(),
- # need defines, not copts, because it's controlling a header, test.h
- defines = ruy_test_ext_defines(),
- linkopts = select({
- ":windows": [],
- "//conditions:default": ["-lm"],
- }),
- deps = [
- ":matrix",
- ":pmu",
- ":ruy",
- ":spec",
- ":time",
- "@com_google_googletest//:gtest",
- ":platform",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:profiler",
- ] + ruy_test_ext_deps(),
-)
-
-ruy_benchmark(
- name = "benchmark",
- srcs = ["benchmark.cc"],
- copts = ruy_copts_base(),
- lhs_rhs_accum_dst = [
- ("f32", "f32", "f32", "f32"),
- ("u8", "u8", "i32", "u8"),
- ("i8", "i8", "i32", "u8"),
- ("i8", "i8", "i32", "i8"),
- ("u8", "u8", "i32", "i16"),
- ("i8", "i8", "i32", "i32"),
- ],
- deps = [
- "//tensorflow/lite/experimental/ruy/ruy:test_lib",
- "//tensorflow/lite/experimental/ruy/ruy/profiler:instrumentation",
- ],
-)
-
-ruy_test(
- name = "test_fast",
- srcs = ["test_fast.cc"],
- copts = ruy_copts_base(),
- lhs_rhs_accum_dst = [
- ("f32", "f32", "f32", "f32"),
- ("f64", "f32", "f64", "f32"),
- ("f32", "f64", "f64", "f64"),
- ("u8", "u8", "i32", "u8"),
- ("i8", "i8", "i32", "i8"),
- ("i8", "u8", "i32", "i8"),
- ("u8", "u8", "i32", "i16"),
- ("i8", "i8", "i32", "i32"),
- ("i8", "u8", "i32", "i32"),
- ],
- deps = [
- "//tensorflow/lite/experimental/ruy/ruy:test_lib",
- "@com_google_googletest//:gtest_main",
- ],
-)
-
-ruy_test(
- name = "test_slow",
- srcs = ["test_slow.cc"],
- copts = ruy_copts_base(),
- lhs_rhs_accum_dst = [
- ("f32", "f32", "f32", "f32"),
- ("u8", "u8", "i32", "u8"),
- ("i8", "i8", "i32", "i8"),
- ("u8", "u8", "i32", "i16"),
- ("i8", "i8", "i32", "i32"),
- ],
- tags = ["slow"],
- deps = [
- "//tensorflow/lite/experimental/ruy/ruy:test_lib",
- "@com_google_googletest//:gtest_main",
- ],
-)
-
-ruy_test(
- name = "test_special_specs",
- srcs = ["test_special_specs.cc"],
- copts = ruy_copts_base(),
- lhs_rhs_accum_dst = [
- ("f32", "f32", "f32", "f32"),
- ("u8", "u8", "i32", "u8"),
- ("u8", "u8", "i32", "i16"),
- ],
- deps = [
- "//tensorflow/lite/experimental/ruy/ruy:test_lib",
- "@com_google_googletest//:gtest_main",
- ],
-)
diff --git a/tensorflow/lite/experimental/ruy/ruy/allocator.cc b/tensorflow/lite/experimental/ruy/ruy/allocator.cc
deleted file mode 100644
index 2c507561f2f..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/allocator.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/allocator.h"
-
-#include <cstdint>
-#include <cstdlib>
-
-#ifdef _WIN32
-#include <malloc.h>
-#endif
-
-namespace ruy {
-
-namespace detail {
-
-void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
-#ifdef _WIN32
- return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
-#else
- void *ptr;
- if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
- return nullptr;
- }
- return ptr;
-#endif
-}
-
-void SystemAlignedFree(void *ptr) {
-#ifdef _WIN32
- _aligned_free(ptr);
-#else
- free(ptr);
-#endif
-}
-
-} // namespace detail
-
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/allocator.h b/tensorflow/lite/experimental/ruy/ruy/allocator.h
deleted file mode 100644
index 56aa0eef8f9..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/allocator.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ALLOCATOR_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ALLOCATOR_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/size_util.h"
-
-namespace ruy {
-
-namespace detail {
-
-inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
- RUY_DCHECK(p);
- std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
- return reinterpret_cast<void*>(addr);
-}
-
-// Minimum alignment for blocks.
-//
-// Considerations:
-// - This needs to be at least the alignment of any usual data type.
-// - It's useful that this is at least the size of a cache line to limit
-// possible cache side effects (if only on performance behavior).
-// - It's useful that this is at least the size of SIMD registers, as
-// some SIMD instruction sets have at least performance behavior
-// differences (e.g. NEON) or even different requirements (e.g. SSE)
-// based on that.
-// - It's useful that this is at least the size of an "exclusive reservation
-// granule" on ARM, meaning that if we use this Allocator to allocate
-// an atomic variable, there will be no side effects from other things
-// contending for exclusive/atomic memory accesses to it. While the
-// ARM reference manual mentions that this granule size may be as large
-// as 2048 bytes, in practice we observe it to be 64 bytes. It can
-// be queried cheaply, at runtime, from userspace, if needed.
-static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;
-
-// Primitive allocation functions obtaining aligned memory from the
-// operating system.
-void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
-void SystemAlignedFree(void* ptr);
-
-// Specialized allocator designed to converge to a steady-state where all
-// allocations are bump-ptr allocations from an already-allocated buffer.
-//
-// To support these constraints, this allocator only supports two
-// operations.
-// - AllocateAlignedBytes: allocates a pointer to storage of a specified
-// size, which must be aligned to kMinimumBlockAlignment.
-// - FreeAll: frees all previous allocations (but retains the internal
-// buffer to minimize future calls into the system allocator).
-//
-// This class is specialized for supporting just those two operations
-// under this specific steady-state usage pattern. Extending this class
-// with new allocation interfaces that don't fit that pattern is probably not
-// the right choice. Instead, build a new class on top of
-// SystemAlignedAlloc/SystemAlignedFree.
-//
-// All operations happen on aligned blocks for simplicity.
-class AlignedAllocator {
- public:
- void operator=(const AlignedAllocator&) = delete;
- ~AlignedAllocator() {
- FreeAll();
- SystemAlignedFree(ptr_);
- }
-
- void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
- RUY_DCHECK_GT(num_bytes, 0);
- RUY_DCHECK((num_bytes & (kMinimumBlockAlignment - 1)) == 0);
- if (void* p = AllocateFast(num_bytes)) {
- return p;
- }
- return AllocateSlow(num_bytes);
- }
-
- void FreeAll() {
- current_ = 0;
- if (fallback_blocks_.empty()) {
- return;
- }
-
- // No rounding-up of the size means linear instead of logarithmic
- // bound on the number of allocation in some worst-case calling patterns.
- // This is considered worth it because minimizing memory usage is important
- // and actual calling patterns in applications that we care about still
- // reach the no-further-allocations steady state in a small finite number
- // of iterations.
- std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
- SystemAlignedFree(ptr_);
- ptr_ = SystemAlignedAlloc(new_size);
- size_ = new_size;
-
- for (void* p : fallback_blocks_) {
- SystemAlignedFree(p);
- }
- fallback_blocks_.clear();
- fallback_blocks_total_size_ = 0;
- }
-
- private:
- void* AllocateFast(std::ptrdiff_t num_bytes) {
- if (current_ + num_bytes > size_) {
- return nullptr;
- }
- void* ret = VoidPtrAdd(ptr_, current_);
- current_ += num_bytes;
- return ret;
- }
-
- void* AllocateSlow(std::ptrdiff_t num_bytes) {
- void* p = SystemAlignedAlloc(num_bytes);
- fallback_blocks_total_size_ += num_bytes;
- fallback_blocks_.push_back(p);
- return p;
- }
-
- // Theory of operation:
- //
- // - ptr_, current_, and size_ implement a basic bump-ptr allocator.
- //
- // - in AllocateAlignedBytes, the fast path is just a bump-ptr
- // allocation. If our bump-ptr allocator doesn't have enough space for an
- // allocation, then we allocate a block from the system allocator to
- // service the allocation request. We save that block in fallback_blocks_
- // and track the total size of the fallback blocks in
- // fallback_blocks_total_size_.
- //
- // - in FreeAll, the fast path just resets the bump-ptr allocator. If
- // there are any fallback blocks, we free them and reallocate the
- // bump-ptr allocator's buffer so that the next sequence of allocations
- // will hopefully not need any fallback blocks.
- void* ptr_ = nullptr;
- std::ptrdiff_t current_ = 0;
- std::ptrdiff_t size_ = 0;
- std::vector<void*> fallback_blocks_;
- std::ptrdiff_t fallback_blocks_total_size_ = 0;
-};
-
-} // namespace detail
-
-// The main Allocator class, with a convenient interface for allocating a
-// typed buffer.
-class Allocator {
- public:
- void* AllocateBytes(std::ptrdiff_t num_bytes) {
- if (num_bytes == 0) {
- return nullptr;
- }
- return aligned.AllocateAlignedBytes(
- round_up_pot(num_bytes, detail::kMinimumBlockAlignment));
- }
- template <typename Pointer>
- void Allocate(std::ptrdiff_t count, Pointer* out) {
- using T = typename std::pointer_traits<Pointer>::element_type;
- *out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
- }
-
- void FreeAll() { aligned.FreeAll(); }
-
- private:
- detail::AlignedAllocator aligned;
-};
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ALLOCATOR_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/allocator_test.cc b/tensorflow/lite/experimental/ruy/ruy/allocator_test.cc
deleted file mode 100644
index 1584b86b4cc..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/allocator_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/allocator.h"
-
-#include <gtest/gtest.h>
-
-namespace ruy {
-namespace {
-
-TEST(AllocatorTest, ReturnsValidMemory) {
- Allocator allocator;
- int *p;
- allocator.Allocate(1, &p);
- ASSERT_NE(p, nullptr);
-
- // If this is bogus memory, ASan will cause this test to fail.
- *p = 42;
-
- allocator.FreeAll();
-}
-
-TEST(AllocatorTest, NoLeak) {
- Allocator allocator;
- // Allocate and free some ridiculously large total amount of memory, so
- // that a leak will hopefully cause some sort of resource exhaustion.
- //
- // Despite the large number of allocations, this test is actually quite
- // fast, since our fast-path allocation logic is very fast.
- constexpr int kNumAllocations = 100 * 1024;
- constexpr int kAllocationSize = 1024 * 1024;
- for (int i = 0; i < kNumAllocations; i++) {
- char *p;
- allocator.Allocate(kAllocationSize, &p);
- allocator.FreeAll();
- }
-}
-
-TEST(AllocatorTest, IncreasingSizes) {
- Allocator allocator;
- // Allocate sizes that increase by small amounts across FreeAll calls.
- for (int i = 1; i < 100 * 1024; i++) {
- char *p;
- allocator.Allocate(i, &p);
- allocator.FreeAll();
- }
-}
-
-TEST(AllocatorTest, ManySmallAllocations) {
- Allocator allocator;
- // Allocate many small allocations between FreeAll calls.
- for (int i = 0; i < 10 * 1024; i += 100) {
- for (int j = 0; j < i; j++) {
- char *p;
- allocator.Allocate(1, &p);
- }
- allocator.FreeAll();
- }
-}
-
-TEST(AllocatorTest, DestructorHandlesMainBumpPtr) {
- // This is a white-box test.
- Allocator allocator;
- allocator.AllocateBytes(1);
- allocator.FreeAll();
- // After the call to FreeAll, the allocator will consolidate all of the memory
- // into the main bump-ptr allocator's block, which we then expect to be freed
- // in the destructor.
- //
- // We have no test assertions -- we primarily expect that this trigger a leak
- // checker and cause the test to fail.
-}
-
-TEST(AllocatorTest, DestructorHandlesFallbackBlocks) {
- // This is a white-box test.
- Allocator allocator;
- // Since we just created the allocator, this will allocate a fallback block,
- // which we then expect to be freed in the destructor.
- //
- // We have no test assertions -- we primarily expect that this trigger a leak
- // checker and cause the test to fail.
- allocator.AllocateBytes(1);
-}
-
-} // namespace
-} // namespace ruy
-
-int main(int argc, char **argv) {
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/experimental/ruy/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/ruy/benchmark.cc
deleted file mode 100644
index 406345cec06..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/benchmark.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include
-#include
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/test.h"
-
-namespace ruy {
-
-using LhsScalar = RUY_TEST_LHSSCALAR;
-using RhsScalar = RUY_TEST_RHSSCALAR;
-using AccumScalar = RUY_TEST_ACCUMSCALAR;
-using DstScalar = RUY_TEST_DSTSCALAR;
-using TestSetType =
- TestSet>;
-
-struct BenchmarkShape {
- int rows;
- int depth;
- int cols;
- int symm_lhs;
- int symm_rhs;
-};
-
-template
-std::vector>> BenchmarkRCC(
- const BenchmarkShape& shape) {
- TestSetType test_set;
- test_set.rows = shape.rows;
- test_set.depth = shape.depth;
- test_set.cols = shape.cols;
- test_set.lhs_order = Order::kRowMajor;
- test_set.rhs_order = Order::kColMajor;
- test_set.dst_order = Order::kColMajor;
- test_set.layout_style = LayoutStyle::kPackedLinear;
- test_set.benchmark = true;
- const int asymmetry_lhs = shape.symm_lhs ? 0 : 1;
- const int asymmetry_rhs = shape.symm_rhs ? 0 : 1;
- test_set.lhs_zero_point = SymmetricZeroPoint() + asymmetry_lhs;
- test_set.rhs_zero_point = SymmetricZeroPoint() + asymmetry_rhs;
- test_set.use_specified_zero_points = true;
- test_set.perchannel = GetBoolEnvVarOrFalse("PERCHANNEL");
- test_set.benchmark_prepack_lhs = GetBoolEnvVarOrFalse("PREPACK_LHS");
- test_set.benchmark_prepack_rhs = GetBoolEnvVarOrFalse("PREPACK_RHS");
- test_set.Run();
- return std::move(test_set.results);
-}
-
-std::vector ParseCommaSeparatedInts(
- const std::string& comma_separated_ints) {
- std::vector result;
- for (std::size_t pos = 0; pos < comma_separated_ints.size();) {
- std::size_t delim_pos = comma_separated_ints.find(',', pos);
- if (delim_pos == std::string::npos) {
- delim_pos = comma_separated_ints.size();
- }
- result.push_back(
- std::stoi(comma_separated_ints.substr(pos, delim_pos - pos)));
- pos = delim_pos + 1;
- }
- return result;
-}
-
-void Benchmark() {
- const bool symm_lhs = std::is_floating_point::value ||
- GetBoolEnvVarOrFalse("SYMM_LHS");
- const bool symm_rhs = std::is_floating_point::value ||
- GetBoolEnvVarOrFalse("SYMM_RHS");
- const bool benchmark_cubic = GetBoolEnvVarOrFalse("RUY_BENCHMARK_CUBIC") ||
- GetBoolEnvVarOrFalse("RUY_BENCHMARK_CUBIC_LIST");
- const int explicit_rows = GetIntEnvVarOrZero("ROWS");
- const int explicit_cols = GetIntEnvVarOrZero("COLS");
- const int explicit_depth = GetIntEnvVarOrZero("DEPTH");
-
- std::vector shapes;
-
- if (benchmark_cubic) {
- std::vector sizes;
- const char* benchmark_cubic_list_env = getenv("RUY_BENCHMARK_CUBIC_LIST");
- if (benchmark_cubic_list_env) {
- sizes = ParseCommaSeparatedInts(benchmark_cubic_list_env);
- } else {
- // Often 8 is used for this multiplier, but to check teeny sizes one can
- // use 1.
- static constexpr int cubic_size_multiplier = 8;
- for (int i = 2 * cubic_size_multiplier;
- i <= (512 * cubic_size_multiplier); i *= 2) {
- sizes.push_back(i);
- if (i < (512 * cubic_size_multiplier)) {
- sizes.push_back(i * 3 / 2);
- }
- }
- }
- for (int i : sizes) {
- BenchmarkShape shape;
- // Even in cubic mode, one may still override an individual dimension
- // to allow testing a batch of rectangular sizes.
- shape.rows = explicit_rows ? explicit_rows : i;
- shape.cols = explicit_cols ? explicit_cols : i;
- shape.depth = explicit_depth ? explicit_depth : i;
- shape.symm_lhs = symm_lhs;
- shape.symm_rhs = symm_rhs;
- shapes.push_back(shape);
- }
- } else {
- BenchmarkShape shape;
- shape.rows = explicit_rows;
- shape.cols = explicit_cols;
- shape.depth = explicit_depth;
- if (!shape.rows || !shape.depth || !shape.cols) {
- fprintf(stderr,
- "Please specify positive sizes with these env vars: ROWS, DEPTH, "
- "COLS.\n");
- exit(1);
- }
- shape.symm_lhs = symm_lhs;
- shape.symm_rhs = symm_rhs;
- shapes.push_back(shape);
- }
-
- for (int i = 0; i < shapes.size(); i++) {
- const auto& shape = shapes[i];
- const auto& results = BenchmarkRCC(shape);
- if (i == 0) {
- if (benchmark_cubic) {
- printf("size");
- for (const auto& result : results) {
- if (results.size() > 1) {
- printf(",%s:Gop/s", PathName(*result).c_str());
- } else {
- printf(",Gop/s");
- }
- if (GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU")) {
- printf(
- ",l1_refill,l2_refill,l3_refill,l1tlb_refill,l2tlb_refill,"
- "mispred,frontend_stall,backend_stall");
- }
- }
- printf("\n");
- } else {
- printf("path,shape,Gop/s\n");
- }
- fflush(stdout);
- }
- if (benchmark_cubic) {
- printf("%d", shape.rows);
- for (const auto& result : results) {
- printf(",%.4g", 2.0e-9 * shape.rows * shape.cols * shape.depth /
- result->latency);
- if (GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU")) {
- printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g,%.3g,%.3g",
- result->l1_refill_rate, result->l2_refill_rate,
- result->l3_refill_rate, result->l1tlb_refill_rate,
- result->l2tlb_refill_rate, result->mispred_rate,
- result->frontend_stall_rate, result->backend_stall_rate);
- }
- }
- printf("\n");
- fflush(stdout);
- } else {
- for (const auto& result : results) {
- printf(
- "%s,%dx%dx%d,%.4g", PathName(*result).c_str(), shape.rows,
- shape.depth, shape.cols,
- 2.0e-9 * shape.rows * shape.cols * shape.depth / result->latency);
- if (GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU")) {
- printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g,%.3g,%.3g",
- result->l1_refill_rate, result->l2_refill_rate,
- result->l3_refill_rate, result->l1tlb_refill_rate,
- result->l2tlb_refill_rate, result->mispred_rate,
- result->frontend_stall_rate, result->backend_stall_rate);
- }
- printf("\n");
- }
- fflush(stdout);
- }
- }
-}
-
-} // namespace ruy
-
-int main() { ruy::Benchmark(); }
diff --git a/tensorflow/lite/experimental/ruy/ruy/block_map.cc b/tensorflow/lite/experimental/ruy/ruy/block_map.cc
deleted file mode 100644
index 32781d82ad3..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/block_map.cc
+++ /dev/null
@@ -1,486 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/block_map.h"
-
-#include
-#include
-
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
-#include
-#include
-#include
-#endif
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/opt_set.h"
-#include "tensorflow/lite/experimental/ruy/ruy/profiler/instrumentation.h"
-#include "tensorflow/lite/experimental/ruy/ruy/size_util.h"
-
-namespace ruy {
-
-namespace {
-
-void DecodeTraversalLinear(int size_log2, std::uint32_t square_index,
- SidePair* local_pos) {
- (*local_pos)[Side::kLhs] = square_index & ((1 << size_log2) - 1);
- (*local_pos)[Side::kRhs] = square_index >> size_log2;
-}
-
-void DecodeTraversalFractalZ(std::uint32_t square_index,
- SidePair* local_pos) {
- const std::uint32_t n1 = square_index;
- const std::uint32_t n2 = (n1 & 0x99999999u) | ((n1 & 0x44444444u) >> 1) |
- ((n1 & 0x22222222u) << 1);
- const std::uint32_t n4 = (n2 & 0xc3c3c3c3u) | ((n2 & 0x30303030u) >> 2) |
- ((n2 & 0x0c0c0c0cu) << 2);
- const std::uint32_t n8 = (n4 & 0xf00ff00fu) | ((n4 & 0x0f000f00u) >> 4) |
- ((n4 & 0x00f000f0u) << 4);
- const std::uint32_t n16 = (n8 & 0xff0000ffu) | ((n8 & 0x00ff0000u) >> 8) |
- ((n8 & 0x0000ff00u) << 8);
- (*local_pos)[Side::kLhs] = n16 & 0xffff;
- (*local_pos)[Side::kRhs] = n16 >> 16;
-}
-
-void DecodeTraversalFractalU(std::uint32_t square_index,
- SidePair* local_pos) {
- DecodeTraversalFractalZ(square_index, local_pos);
- // Change fractal z-order to u-order
- (*local_pos)[Side::kLhs] ^= (*local_pos)[Side::kRhs];
-}
-
-// Code inspired by the sample code in
-// https://en.wikipedia.org/wiki/Hilbert_curve
-// The main optimization is to avoid hard-to-predict conditional branches
-// based on the bits of the square_index parameter.
-void DecodeTraversalFractalHilbert(int size_log2, std::uint32_t square_index,
- SidePair* local_pos) {
- std::uint32_t t = square_index;
- std::uint32_t x = 0;
- std::uint32_t y = 0;
- // Easy-to-predict for loop, the number of iterations is the same for
- // an entire GEMM.
- for (int sb = 0; sb < size_log2; sb++) {
- std::uint32_t s = 1 << sb;
- bool rx = t & 2;
- bool ry = (t & 1) ^ rx;
- std::uint32_t tmp = rx ? (s - 1 - x) : x;
- x = ry ? x : rx ? (s - 1 - y) : y;
- y = ry ? (y + s) : tmp;
- x = rx ? (x + s) : x;
- t >>= 2;
- }
- (*local_pos)[Side::kLhs] = y;
- (*local_pos)[Side::kRhs] = x;
-}
-
-} // end anonymous namespace
-
-void GetBlockByIndex(const BlockMap& block_map, int index,
- SidePair* block) {
- profiler::ScopeLabel label("GetBlockByIndex");
- const std::uint32_t index_u32 = index;
-
- const std::uint32_t num_blocks_per_local_curve =
- 1u << (2 * block_map.num_blocks_base_log2);
- const std::uint32_t square_index =
- index_u32 & (num_blocks_per_local_curve - 1);
-
- const int size_log2 = block_map.num_blocks_base_log2;
- SidePair local_pos;
- switch (block_map.traversal_order) {
- case BlockMapTraversalOrder::kFractalZ:
- DecodeTraversalFractalZ(square_index, &local_pos);
- break;
- case BlockMapTraversalOrder::kFractalU:
- DecodeTraversalFractalU(square_index, &local_pos);
- break;
- case BlockMapTraversalOrder::kFractalHilbert:
- DecodeTraversalFractalHilbert(size_log2, square_index, &local_pos);
- break;
- default:
- RUY_DCHECK(block_map.traversal_order == BlockMapTraversalOrder::kLinear);
- DecodeTraversalLinear(size_log2, square_index, &local_pos);
- break;
- }
-
- const std::uint32_t rectangular_index =
- index_u32 >> 2 * block_map.num_blocks_base_log2;
- for (Side side : {Side::kLhs, Side::kRhs}) {
- const std::uint32_t mask = (1u << block_map.rectangularness_log2[side]) - 1;
- const int rectangular_offset = (rectangular_index & mask)
- << block_map.num_blocks_base_log2;
- (*block)[side] = local_pos[side] + rectangular_offset;
- }
-}
-
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
- int lhs_scalar_size,
- int rhs_scalar_size,
- int local_data_cache_size,
- int shared_data_cache_size) {
- const int kFractalOptSets =
- RUY_OPT_FRACTAL_Z | RUY_OPT_FRACTAL_U | RUY_OPT_FRACTAL_HILBERT;
- const int working_set_size =
- (lhs_scalar_size * rows + rhs_scalar_size * cols) * depth;
- if (RUY_OPT_ENABLED(kFractalOptSets) &&
- (working_set_size > local_data_cache_size)) {
- if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL_HILBERT) &&
- (working_set_size > shared_data_cache_size)) {
- return BlockMapTraversalOrder::kFractalHilbert;
- } else if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)) {
- return BlockMapTraversalOrder::kFractalU;
- } else {
- return BlockMapTraversalOrder::kFractalZ;
- }
- } else {
- return BlockMapTraversalOrder::kLinear;
- }
-}
-
-namespace {
-
-int floor_log2_quotient(int num, int denom) {
- if (num <= denom) {
- return 0;
- }
- int log2_quotient = floor_log2(num) - ceil_log2(denom);
- if ((denom << (log2_quotient + 1)) <= num) {
- log2_quotient++;
- }
- return log2_quotient;
-}
-
-// Computes the rectangularness of the matrix shape (rows, cols). This is
-// essentially just the log2 of the quotient (rows / cols). The kernel_rows and
-// kernel_cols only get into the picture for clamping bounds but don't affect
-// the generic computation.
-void GetRectangularness(int rows, int cols, int kernel_rows, int kernel_cols,
- int* rows_rectangularness_log2,
- int* cols_rectangularness_log2) {
- *rows_rectangularness_log2 = 0;
- *cols_rectangularness_log2 = 0;
-
- // In GEMV-ish cases, that is when kernel blocks are as narrow as the kernel
- // itself, we risk having too small kernel blocks for good kernel
- // amortization. We avoid that by limiting recangularness so that kernel
- // blocks are not too tiny at least in that dimension. Specifically, we try to
- // have at least (2^min_kernel_inner_loop_runs_log2) kernels fitting in each
- // kernel block along the large dimension.
- const int min_kernel_inner_loop_runs_log2 = 3;
- if (rows > cols) {
- int cols_of_kernel_inner_loop_runs_log2 =
- ceil_log2(cols) - pot_log2(kernel_cols);
- int min_rows_of_kernel_inner_loop_runs_log2 =
- std::max(0, min_kernel_inner_loop_runs_log2 -
- cols_of_kernel_inner_loop_runs_log2);
- *rows_rectangularness_log2 =
- std::min(floor_log2_quotient(rows, cols),
- std::max(0, floor_log2(rows) - pot_log2(kernel_rows) -
- min_rows_of_kernel_inner_loop_runs_log2));
- // Sanity check that we did not over-estimate rows_rectangularness_log2.
- RUY_DCHECK_GE(rows >> *rows_rectangularness_log2, cols);
- } else if (cols > rows) {
- int rows_of_kernel_inner_loop_runs_log2 =
- ceil_log2(rows) - pot_log2(kernel_rows);
- int min_cols_of_kernel_inner_loop_runs_log2 =
- std::max(0, min_kernel_inner_loop_runs_log2 -
- rows_of_kernel_inner_loop_runs_log2);
- *cols_rectangularness_log2 =
- std::min(floor_log2_quotient(cols, rows),
- std::max(0, floor_log2(cols) - pot_log2(kernel_cols) -
- min_cols_of_kernel_inner_loop_runs_log2));
- // Sanity check that we did not over-estimate cols_rectangularness_log2.
- RUY_DCHECK_GE(cols >> *cols_rectangularness_log2, rows);
- }
- RUY_DCHECK(!*rows_rectangularness_log2 || !*cols_rectangularness_log2);
-}
-
-// Computes a 'multithreading score'. When multithreading, we need there to
-// be at least as many tiles as there are threads, and hopefully
-// substantially more than that, so we benefit from ruy's ability to
-// dispatch fine-grained workloads to threads.
-int GetMultithreadingScore(int block_size_log2, int rows, int cols,
- int tentative_thread_count) {
- const int num_full_blocks_of_rows = rows >> block_size_log2;
- const int num_full_blocks_of_cols = cols >> block_size_log2;
- const int candidate_num_full_blocks_log2 = floor_log2(
- std::max(1, num_full_blocks_of_rows * num_full_blocks_of_cols));
-
- // The values here have been tuned on ARM Cortex-A55.
- // We expect this to have to be tuned differently for other CPUs.
- if (tentative_thread_count == 1) {
- return 0;
- } else {
- const int blocks_per_thread_log2 =
- candidate_num_full_blocks_log2 - ceil_log2(tentative_thread_count);
- if (blocks_per_thread_log2 < 0) {
- return -64;
- } else if (blocks_per_thread_log2 == 0) {
- return -16;
- } else if (blocks_per_thread_log2 == 1) {
- return -8;
- } else if (blocks_per_thread_log2 == 2) {
- return 0;
- } else if (blocks_per_thread_log2 == 3) {
- return 8;
- } else {
- return 16;
- }
- }
-}
-
-// Computes a 'cache locality score'.
-int GetCacheLocalityScore(int block_size_log2, int rows, int cols, int depth,
- int kernel_rows_log2, int kernel_cols_log2,
- int lhs_scalar_size, int rhs_scalar_size, Path path,
- int local_data_cache_size) {
- // In the narrow case (e.g. matrix*vector), each byte of the big operand
- // matrix (either LHS or RHS) is traversed only once, so any notion of data
- // locality is irrelevant. Ignore the 'cache locality score' by forcing it to
- // be 0 in that case.
- if (rows <= (1 << kernel_rows_log2) || cols <= (1 << kernel_cols_log2)) {
- return 0;
- }
- const int block_rows = std::min(1 << block_size_log2, rows);
- const int block_cols = std::min(1 << block_size_log2, cols);
- const int total_read_bytes =
- (lhs_scalar_size * block_rows + rhs_scalar_size * block_cols) * depth;
- const int total_read_bytes_log2 = ceil_log2(total_read_bytes);
- const int nonlocality_log2 =
- total_read_bytes_log2 - floor_log2(local_data_cache_size);
- // The values here have been tuned on ARM Cortex-A55.
- // We expect this to have to be tuned differently for other CPUs.
- if (nonlocality_log2 < -1) {
- return 64;
- } else if (nonlocality_log2 == -1) {
- return 56;
- } else if (nonlocality_log2 == 0) {
- return 48;
- } else if (nonlocality_log2 == 1) {
- return 32;
- } else if (nonlocality_log2 == 2) {
- return 16;
- } else if (nonlocality_log2 == 3) {
- return 0;
- } else {
- return -64;
- }
-}
-
-// Compute a 'kernel amortization score'. This is the notion that very small
-// tiles result in more overhead outside of kernels, more complex memory
-// access patterns and less benefits from ruy's fat kernels, so we reward
-// larger blocks more than smaller ones.
-int GetKernelAmortizationScore(int block_size_log2, int rows, int cols,
- int kernel_rows_log2, int kernel_cols_log2) {
- const int block_rows = std::min(1 << block_size_log2, rows);
- const int block_cols = std::min(1 << block_size_log2, cols);
- const int kernels_per_block_log2 =
- floor_log2(block_rows * block_cols) - kernel_rows_log2 - kernel_cols_log2;
- RUY_DCHECK_GE(kernels_per_block_log2, 0);
- // The values here have been tuned on ARM Cortex-A55.
- // We expect this to have to be tuned differently for other CPUs.
- if (kernels_per_block_log2 == 0) {
- return 0;
- } else if (kernels_per_block_log2 == 1) {
- return 8;
- } else if (kernels_per_block_log2 == 2) {
- return 16;
- } else if (kernels_per_block_log2 == 3) {
- return 24;
- } else if (kernels_per_block_log2 == 4) {
- return 32;
- } else if (kernels_per_block_log2 == 5) {
- return 40;
- } else if (kernels_per_block_log2 == 6) {
- return 48;
- } else if (kernels_per_block_log2 == 7) {
- return 56;
- } else {
- return 64;
- }
-}
-
-} // namespace
-
-void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
- int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
- int tentative_thread_count, Path path,
- int local_data_cache_size, int shared_data_cache_size,
- BlockMap* block_map) {
- profiler::ScopeLabel label("MakeBlockMap");
-
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
-#if RUY_MAKEBLOCKMAP_DEBUG >= 2
- static constexpr bool debug_everytime = true;
-#else
- static constexpr bool debug_everytime = false;
-#endif
- static bool firsttime = true;
- if (firsttime || debug_everytime) {
- fprintf(stderr,
- "MakeBlockMap(rows=%d, cols=%d, depth=%d, kernel_rows=%d, "
- "kernel_cols=%d, lhs_scalar_size=%d, rhs_scalar_size=%d, "
- "tentative_thread_count=%d)\n",
- rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
- rhs_scalar_size, tentative_thread_count);
- }
-#endif
-
- RUY_DCHECK_GE(rows, kernel_rows);
- RUY_DCHECK_GE(cols, kernel_cols);
- RUY_DCHECK_EQ(rows % kernel_rows, 0);
- RUY_DCHECK_EQ(cols % kernel_cols, 0);
-
- block_map->traversal_order =
- GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
- local_data_cache_size, shared_data_cache_size);
-
- int rows_rectangularness_log2 = 0;
- int cols_rectangularness_log2 = 0;
- GetRectangularness(rows, cols, kernel_rows, kernel_cols,
- &rows_rectangularness_log2, &cols_rectangularness_log2);
-
- const int kernel_rows_log2 = pot_log2(kernel_rows);
- const int kernel_cols_log2 = pot_log2(kernel_cols);
- const int kernel_size_log2 = std::max(kernel_cols_log2, kernel_rows_log2);
-
- const int size = std::min(rows, cols);
- const int size_log2 = std::max(kernel_size_log2, floor_log2(size));
-
- RUY_DCHECK_GE(size_log2, kernel_size_log2);
-
- // We are going to try candidate values for block_size_log2 ranging from
- // kernel_size_log2 to (kernel_size_log2 + kMaxKernelsPerBlockLog2).
- // For each of them we will compute a 'score' by adding individual scores
- // for a few different considerations, all of which is entirely empirical.
- // The values (and possibly the logic) around here are all subject to tuning
- // based on benchmarks on different hardware. The current values are based
- // on benchmarking on Qualcomm S855 (big and little cores), arm64,
- // kNeonDotprod, 8bit quantized path. Don't read too much into it, go ahead
- // and tune this as needed to achieve good performance elsewhere. Use
- // the unit test, block_map_test, to encode values that should be preserved
- // on specific architectures. Use RUY_MAKEBLOCKMAP_DEBUG to help tuning this.
- static constexpr int kMaxKernelsPerBlockLog2 = 6;
- const int max_block_size_log2 =
- std::min(size_log2, kernel_size_log2 + kMaxKernelsPerBlockLog2);
- int best_score = std::numeric_limits::min();
- int best_score_block_size_log2 = -1;
- for (int block_size_log2 = kernel_size_log2;
- block_size_log2 <= max_block_size_log2; block_size_log2++) {
- const int multithreading_score = GetMultithreadingScore(
- block_size_log2, rows, cols, tentative_thread_count);
- const int cache_locality_score = GetCacheLocalityScore(
- block_size_log2, rows, cols, depth, kernel_rows_log2, kernel_cols_log2,
- lhs_scalar_size, rhs_scalar_size, path, local_data_cache_size);
- const int kernel_amortization_score = GetKernelAmortizationScore(
- block_size_log2, rows, cols, kernel_rows_log2, kernel_cols_log2);
- const int score =
- multithreading_score + cache_locality_score + kernel_amortization_score;
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
- if (firsttime || debug_everytime) {
- fprintf(stderr,
- "block_size_log2=%d: score=%d multithreading_score=%d "
- "cache_locality_score=%d kernel_amortization_score=%d\n",
- block_size_log2, score, multithreading_score,
- cache_locality_score, kernel_amortization_score);
- }
-#endif
- if (score >= best_score) {
- best_score = score;
- best_score_block_size_log2 = block_size_log2;
- }
- }
-
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
- if (firsttime || debug_everytime) {
- fprintf(stderr, "best_score_block_size_log2=%d\n",
- best_score_block_size_log2);
- }
-
- static const char* explicit_block_size_log2_env =
- getenv("RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2");
- if (explicit_block_size_log2_env) {
- best_score_block_size_log2 = std::stoi(explicit_block_size_log2_env);
- if (firsttime || debug_everytime) {
- fprintf(stderr, "Overridden best_score_block_size_log2=%d\n",
- best_score_block_size_log2);
- }
- }
- firsttime = false;
-#endif
-
- int num_blocks_base_log2 = size_log2 - best_score_block_size_log2;
- RUY_DCHECK_GE(num_blocks_base_log2, 0);
-
- const int num_blocks_of_rows_log2 =
- num_blocks_base_log2 + rows_rectangularness_log2;
- const int num_blocks_of_cols_log2 =
- num_blocks_base_log2 + cols_rectangularness_log2;
-
- const int smallr =
- round_down_pot(rows >> num_blocks_of_rows_log2, kernel_rows);
- const int smallc =
- round_down_pot(cols >> num_blocks_of_cols_log2, kernel_cols);
- const int missr =
- round_up_pot(rows - (smallr << num_blocks_of_rows_log2), kernel_rows) >>
- pot_log2(kernel_rows);
- const int missc =
- round_up_pot(cols - (smallc << num_blocks_of_cols_log2), kernel_cols) >>
- pot_log2(kernel_cols);
-
- block_map->dims[Side::kLhs] = rows;
- block_map->dims[Side::kRhs] = cols;
- block_map->kernel_dims[Side::kLhs] = kernel_rows;
- block_map->kernel_dims[Side::kRhs] = kernel_cols;
- block_map->num_blocks_base_log2 = num_blocks_base_log2;
- block_map->rectangularness_log2[Side::kLhs] = rows_rectangularness_log2;
- block_map->rectangularness_log2[Side::kRhs] = cols_rectangularness_log2;
- block_map->small_block_dims[Side::kLhs] = smallr;
- block_map->small_block_dims[Side::kRhs] = smallc;
- block_map->large_blocks[Side::kLhs] = missr;
- block_map->large_blocks[Side::kRhs] = missc;
- // Done last: NumBlocks needs some of the block_map fields to be already set.
- block_map->thread_count =
- std::min(tentative_thread_count, NumBlocks(*block_map));
-}
-
-void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
- int* start, int* end) {
- profiler::ScopeLabel label("GetBlockMatrixCoords");
- *start = block * block_map.small_block_dims[side] +
- std::min(block, block_map.large_blocks[side]) *
- block_map.kernel_dims[side];
- *end =
- *start + block_map.small_block_dims[side] +
- (block < block_map.large_blocks[side] ? block_map.kernel_dims[side] : 0);
-
- RUY_DCHECK_EQ(0, *start % block_map.kernel_dims[side]);
- RUY_DCHECK_EQ(0, *end % block_map.kernel_dims[side]);
- RUY_DCHECK_LE(*end, block_map.dims[side]);
- RUY_DCHECK_LT(*start, *end);
- RUY_DCHECK_GE(*start, 0);
-}
-
-void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair& block,
- SidePair* start, SidePair* end) {
- for (Side side : {Side::kLhs, Side::kRhs}) {
- GetBlockMatrixCoords(side, block_map, block[side], &(*start)[side],
- &(*end)[side]);
- }
-}
-
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/block_map.h b/tensorflow/lite/experimental/ruy/ruy/block_map.h
deleted file mode 100644
index 0fa4c9d5d60..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/block_map.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCK_MAP_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCK_MAP_H_
-
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/side_pair.h"
-
-namespace ruy {
-
-enum class BlockMapTraversalOrder {
- // Plain old row-by-row or column-by-column traversal.
- kLinear,
- // Fractal Z-order curve, https://en.wikipedia.org/wiki/Z-order_curve
- kFractalZ,
- // Variant of Z-order doing a U instead of a Z.
- kFractalU,
- // Hilbert curve, https://en.wikipedia.org/wiki/Hilbert_curve
- kFractalHilbert
-};
-
-// A BlockMap describes a tiling of a matrix, typically the destination matrix
-// of a matrix multiplication computation. As is standard in matrix
-// multiplication, a tile is called a "block".
-//
-// Ruy subdivides work by blocks of the destination matrix: each thread fully
-// computes a block at once, then moves on to another block; each block is
-// produced by a single thread.
-//
-// This ensures that the workloads for each block are mutually independent,
-// which reduces synchronization requirements.
-//
-// Typically, a matrix multiplication will early on create a BlockMap by
-// calling MakeBlockMap. It will then query the number of blocks in that
-// BlockMap by calling NumBlocks. It will then create a single atomic integer
-// counter indexing these blocks, called the 'index', and will distribute
-// work to its N threads by ensuring that each thread works on disjoint sets
-// of index values. For a given index value, the thread will call
-// GetBlockByIndex to get the corresponding block, then GetBlockMatrixCoords
-// to find the actual row and column numbers of this block.
-//
-// There are two nested levels of subdivision. On a local level, the matrix is
-// tiled into a square NxN grid where N is a power of two, specifically:
-// N = 2^num_blocks_base_log2.
-//
-// At a larger scale, around these blocks, there may be one further
-// level of subdivision, in only one dimension: either along rows or along
-// columns. That is used to handle arbitrarily rectangular matrices. The
-// aforementioned high-level block grid is square, so it does not readily fit
-// well very rectangular matrices.
-//
-// Taking together these two nested levels of subdivision, the effective
-// tiling is by
-// 2^(num_blocks_base_log2 + rows_rectangularness_log2)
-// blocks in the row dimension, and by
-// 2^(num_blocks_base_log2 + cols_rectangularness_log2)
-// blocks in the column dimension. See NumBlocksOfRows, NumBlocksOfCols.
-//
-// Either rows_rectangularness_log2 or cols_rectangularness_log2 must be zero.
-//
-// Finally, this BlockMap is designed to operate under alignment constraints:
-// two fields, kernel_rows and kernel_cols, describe the requested alignment
-// of the effective grid in both dimensions. The idea is to feed matrix
-// multiplication kernels with tiles that fit their width as much as possible.
-// Of course, if rows (resp. cols) is not a multiple of kernel_rows (resp.
-// kernel_cols) then some tile will have to have unaligned size. BlockMap
-// will only allow that to happen in the last position along each axis, so
-// as to minimize the overhead incurred onto the matrix multiplication kernels.
-struct BlockMap {
- // The number of threads to use (to distribute the blocks to).
- int thread_count;
- // The order in which to traverse the matrix of which this BlockMap represents
- // a tiling (hereafter "the matrix").
- BlockMapTraversalOrder traversal_order;
- // The dimensions of the block_map, that is, of the destination
- // matrix rounded up to next multiples of kernel_dims.
- SidePair dims;
- // Log2 of the minimum number of subdivisions of the grid along either axis.
- int num_blocks_base_log2;
- // Log2 of the additional subdivision of the rows/columns axis.
- SidePair rectangularness_log2;
- // Requested alignment of the subdivisions of the grid along the rows/columns
- // axis.
- SidePair kernel_dims;
- // Internal helper. Minimum number of rows/columns in each block.
- SidePair small_block_dims;
- // Internal helper. Number of blocks along each dimension that need to have
- // their size in that dimension be given by (small_block_dims + kernel_dims)
- // instead of just small_block_dims.
- SidePair large_blocks;
-};
-
-// Returns the traversal order to be used for the given matrix multiplication
-// parameters.
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
- int lhs_scalar_size,
- int rhs_scalar_size,
- int local_data_cache_size,
- int shared_data_cache_size);
-
-// Create a BlockMap suitable for tiling the destination matrix in a
-// matrix multiplication with the given parameters.
-void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
- int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
- int tentative_thread_count, Path path,
- int local_data_cache_size, int shared_data_cache_size,
- BlockMap* block_map);
-
-// Maps an integer index to a block position in the grid.
-void GetBlockByIndex(const BlockMap& block_map, int index,
- SidePair* block);
-
-// Given a block position in the grid, returns its actual
-// position in the matrix that the BlockMap refers to in the dimension
-// referred to by `side`: along rows if side==kLhs, along columns if
-// side==kRhs.
-void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
- int* start, int* end);
-
-// Given a block position in the grid, returns its actual
-// position in the matrix that the BlockMap refers to in terms of
-// actual row/column indices.
-void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair& block,
- SidePair* start, SidePair* end);
-
-// Returns the number of grid subdivisions along the rows dimension (if
-// side == kLhs) or columns dimension (if side == kRhs).
-inline int NumBlocksPerSide(Side side, const BlockMap& block_map) {
- return 1 << (block_map.num_blocks_base_log2 +
- block_map.rectangularness_log2[side]);
-}
-
-// Returns the overall number of blocks in
-// the BlockMap. The valid index values to pass to GetBlockByIndex are the
-// integers from 0 to N-1 where N is the value returned here.
-//
-// Note that it is always true that
-// NumBlocks == NumBlocksOfRows * NumBlocksOfCols
-// because either rows_rectangularness_log2 or cols_rectangularness_log2 is 0.
-inline int NumBlocks(const BlockMap& block_map) {
- return 1 << (2 * block_map.num_blocks_base_log2 +
- block_map.rectangularness_log2[Side::kLhs] +
- block_map.rectangularness_log2[Side::kRhs]);
-}
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCK_MAP_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/block_map_test.cc b/tensorflow/lite/experimental/ruy/ruy/block_map_test.cc
deleted file mode 100644
index cdd7ee0e01f..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/block_map_test.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/block_map.h"
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "tensorflow/lite/experimental/ruy/ruy/cpu_cache_size.h"
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/side_pair.h"
-
-namespace ruy {
-namespace {
-
-#if RUY_PLATFORM(NEON_64)
-
-// Unless otherwise specified, these tests have been tuned on ARM Cortex-A55.
-void MakeBlockMapTuningTest(int rows, int cols, int depth, int kernel_rows,
- int kernel_cols, int lhs_scalar_size,
- int rhs_scalar_size, int tentative_thread_count,
- Path path, int expected_num_blocks_base_log2,
- int expected_rectangularness_log2) {
- BlockMap block_map;
- MakeBlockMap(rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
- rhs_scalar_size, tentative_thread_count, path,
- LocalDataCacheSize(path), SharedDataCacheSize(path), &block_map);
- EXPECT_EQ(block_map.num_blocks_base_log2, expected_num_blocks_base_log2);
- EXPECT_EQ(std::min(block_map.rectangularness_log2[Side::kLhs],
- block_map.rectangularness_log2[Side::kRhs]),
- 0);
- EXPECT_EQ(std::max(block_map.rectangularness_log2[Side::kLhs],
- block_map.rectangularness_log2[Side::kRhs]),
- expected_rectangularness_log2);
-}
-
-TEST(BlockMapTest, MakeBlockMapTuningTest8bitCubicShapesOneThreadNeonDotprod) {
- MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
-}
-
-TEST(BlockMapTest,
- MakeBlockMapTuningTest8bitCubicShapesFourThreadsNeonDotprod) {
- MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 1,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 2,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 2,
- /* expected_rectangularness_log2 */ 0);
-}
-
-TEST(BlockMapTest, MakeBlockMapTuningTest32bit) {
- MakeBlockMapTuningTest(256, 256, 256, 8, 8, 4, 4,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 3,
- /* expected_rectangularness_log2 */ 0);
- MakeBlockMapTuningTest(4096, 4096, 4096, 8, 8, 4, 4,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 7,
- /* expected_rectangularness_log2 */ 0);
-}
-
-TEST(BlockMapTest, MakeBlockMapTuningTestRectangular) {
- MakeBlockMapTuningTest(256, 16, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 3);
- MakeBlockMapTuningTest(24, 2400, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
- /* expected_num_blocks_base_log2 */ 0,
- /* expected_rectangularness_log2 */ 6);
-}
-
-#endif
-
-int L1Distance(const SidePair& a, const SidePair& b) {
- return std::abs(a[Side::kLhs] - b[Side::kLhs]) +
- std::abs(a[Side::kRhs] - b[Side::kRhs]);
-}
-
-void GetBlockByIndexSquareTest(int num_blocks_base_log2,
- BlockMapTraversalOrder traversal_order) {
- // Arbitrary, does not affect this test. 3 is just a typical value.
- constexpr int kKernelSizeLog2 = 3;
-
- const int size_log2 = num_blocks_base_log2 + kKernelSizeLog2;
- BlockMap block_map;
- block_map.thread_count = 1;
- block_map.traversal_order = traversal_order;
- block_map.num_blocks_base_log2 = num_blocks_base_log2;
- for (Side side : {Side::kLhs, Side::kRhs}) {
- block_map.dims[side] = 1 << size_log2;
- block_map.rectangularness_log2[side] = 0;
- block_map.kernel_dims[side] = 1 << kKernelSizeLog2;
- block_map.small_block_dims[side] = block_map.kernel_dims[side];
- block_map.large_blocks[side] = 0;
- }
-
- const int num_blocks_per_side = 1 << num_blocks_base_log2;
- const int num_blocks = num_blocks_per_side * num_blocks_per_side;
- EXPECT_EQ(num_blocks, NumBlocks(block_map));
-
- // Perform a full traversal of all blocks, as if computing a whole matrix
- // multiplication.
- //
- // Used to record how many times each block was hit by the traversal.
- std::vector block_hit_counts(num_blocks);
- // Here we guard an assumption that all traversal orders start at (0, 0).
- SidePair previous_block_coords(0, 0);
- // Sum of L1 norm of the coordinate change at every step of the traversal.
- std::int64_t total_l1_distance = 0;
- // Number of jumps i.e. traversal steps with a L1 norm greater than 1.
- int discontinuity_count = 0;
- for (int block_index = 0; block_index < num_blocks; block_index++) {
- SidePair block_coords;
- GetBlockByIndex(block_map, block_index, &block_coords);
- ++block_hit_counts[block_coords[Side::kLhs] +
- num_blocks_per_side * block_coords[Side::kRhs]];
- int distance = L1Distance(block_coords, previous_block_coords);
- total_l1_distance += distance;
- discontinuity_count += (distance > 1);
- previous_block_coords = block_coords;
- }
-
- // Verify that each block was traversed exactly once.
- for (int l = 0; l < num_blocks_per_side; l++) {
- for (int r = 0; r < num_blocks_per_side; r++) {
- EXPECT_EQ(block_hit_counts[l + num_blocks_per_side * r], 1);
- }
- }
-
- // Verify that the discontinuity_count and total_l1_distance are as expected
- // for the given traversal_order.
- switch (traversal_order) {
- case BlockMapTraversalOrder::kFractalHilbert:
- // No discontinuity at all with this space-filling continuous curve!
- EXPECT_EQ(discontinuity_count, 0);
- // Therefore, total_l1_distance has to be the number of blocks minus one.
- EXPECT_EQ(total_l1_distance, num_blocks - 1);
- break;
- case BlockMapTraversalOrder::kLinear:
- EXPECT_EQ(discontinuity_count, num_blocks_per_side - 1);
- EXPECT_EQ(total_l1_distance,
- 2 * num_blocks_per_side * (num_blocks_per_side - 1));
- break;
- case BlockMapTraversalOrder::kFractalZ:
- EXPECT_EQ(discontinuity_count, num_blocks > 1 ? (num_blocks / 2 - 1) : 0);
- EXPECT_EQ(total_l1_distance,
- 2 * num_blocks_per_side * (num_blocks_per_side - 1));
- break;
- case BlockMapTraversalOrder::kFractalU: {
- if (num_blocks_base_log2 == 0) {
- EXPECT_EQ(discontinuity_count, 0);
- EXPECT_EQ(total_l1_distance, 0);
- } else {
- int expected_discontinuity_count = 0;
- int expected_total_l1_distance = 3;
- for (int i = 2; i <= num_blocks_base_log2; i++) {
- expected_discontinuity_count = 4 * expected_discontinuity_count + 2;
- expected_total_l1_distance =
- 4 * expected_total_l1_distance + (1 << (i + 1)) - 1;
- }
- EXPECT_EQ(discontinuity_count, expected_discontinuity_count);
- EXPECT_EQ(total_l1_distance, expected_total_l1_distance);
- }
- break;
- }
- default:
- abort();
- }
-}
-
-TEST(BlockMapTest, GetBlockByIndexSquare) {
- for (int num_blocks_base_log2 = 0; num_blocks_base_log2 <= 10;
- num_blocks_base_log2++) {
- for (BlockMapTraversalOrder traversal_order :
- {BlockMapTraversalOrder::kLinear, BlockMapTraversalOrder::kFractalZ,
- BlockMapTraversalOrder::kFractalU,
- BlockMapTraversalOrder::kFractalHilbert}) {
- GetBlockByIndexSquareTest(num_blocks_base_log2, traversal_order);
- }
- }
-}
-
-} // namespace
-} // namespace ruy
-
-int main(int argc, char **argv) {
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/experimental/ruy/ruy/blocking_counter.cc b/tensorflow/lite/experimental/ruy/ruy/blocking_counter.cc
deleted file mode 100644
index d313ffce51b..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/blocking_counter.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/blocking_counter.h"
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/wait.h"
-
-namespace ruy {
-
-void BlockingCounter::Reset(int initial_count) {
- int old_count_value = count_.load(std::memory_order_relaxed);
- RUY_DCHECK_EQ(old_count_value, 0);
- (void)old_count_value;
- count_.store(initial_count, std::memory_order_release);
-}
-
-bool BlockingCounter::DecrementCount() {
- int old_count_value = count_.fetch_sub(1, std::memory_order_acq_rel);
- RUY_DCHECK_GT(old_count_value, 0);
- int count_value = old_count_value - 1;
- bool hit_zero = (count_value == 0);
- if (hit_zero) {
- std::lock_guard lock(count_mutex_);
- count_cond_.notify_all();
- }
- return hit_zero;
-}
-
-void BlockingCounter::Wait() {
- const auto& condition = [this]() {
- return count_.load(std::memory_order_acquire) == 0;
- };
- ruy::Wait(condition, &count_cond_, &count_mutex_);
-}
-
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/blocking_counter.h b/tensorflow/lite/experimental/ruy/ruy/blocking_counter.h
deleted file mode 100644
index 878f0e7219e..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/blocking_counter.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCKING_COUNTER_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCKING_COUNTER_H_
-
-#include
-#include // NOLINT(build/c++11) // IWYU pragma: keep
-#include // NOLINT(build/c++11) // IWYU pragma: keep
-
-namespace ruy {
-
-// A BlockingCounter lets one thread to wait for N events to occur.
-// This is how the master thread waits for all the worker threads
-// to have finished working.
-// The waiting is done using a naive spinlock waiting for the atomic
-// count_ to hit the value 0. This is acceptable because in our usage
-// pattern, BlockingCounter is used only to synchronize threads after
-// short-lived tasks (performing parts of the same GEMM). It is not used
-// for synchronizing longer waits (resuming work on the next GEMM).
-class BlockingCounter {
- public:
- BlockingCounter() : count_(0) {}
-
- // Sets/resets the counter; initial_count is the number of
- // decrementing events that the Wait() call will be waiting for.
- void Reset(int initial_count);
-
- // Decrements the counter; if the counter hits zero, signals
- // the threads that were waiting for that, and returns true.
- // Otherwise (if the decremented count is still nonzero),
- // returns false.
- bool DecrementCount();
-
- // Waits for the N other threads (N having been set by Reset())
- // to hit the BlockingCounter.
- void Wait();
-
- private:
- std::atomic count_;
-
- // The condition variable and mutex allowing to passively wait for count_
- // to reach the value zero, in the case of longer waits.
- std::condition_variable count_cond_;
- std::mutex count_mutex_;
-};
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_BLOCKING_COUNTER_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/build_defs.bzl b/tensorflow/lite/experimental/ruy/ruy/build_defs.bzl
deleted file mode 100644
index 9bccccf6316..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/build_defs.bzl
+++ /dev/null
@@ -1,40 +0,0 @@
-"""Build definitions for Ruy."""
-
-# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
-# ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
-# 2. Explicitly pass -O3 on optimization configs where just "-c opt" means "optimize for code size".
-
-def ruy_copts_base():
- return select({
- ":armeabi-v7a": [
- "-mfpu=neon",
- ],
- "//conditions:default": [],
- }) + select({
- ":optimized": ["-O3"],
- "//conditions:default": [],
- })
-
-# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
-def ruy_copts_skylake():
- return []
-
-# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
-def ruy_copts_avx2():
- return []
-
-# TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
-def ruy_copts_sse42():
- return []
-
-# TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
-def ruy_copts_avxvnni():
- return []
diff --git a/tensorflow/lite/experimental/ruy/ruy/check_macros.h b/tensorflow/lite/experimental/ruy/ruy/check_macros.h
deleted file mode 100644
index 773f37d99f2..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/check_macros.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CHECK_MACROS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CHECK_MACROS_H_
-
-#include
-#include
-#include
-
-namespace ruy {
-namespace check_macros {
-
-constexpr int kValueBufSize = 32;
-
-template
-struct ToString {
- static void Run(const T& value, char* buf) {
- snprintf(buf, kValueBufSize, "(?)");
- }
-};
-
-template <>
-struct ToString {
- static void Run(float value, char* buf) {
- snprintf(buf, kValueBufSize, "%.9g", static_cast(value));
- }
-};
-
-template <>
-struct ToString {
- static void Run(double value, char* buf) {
- snprintf(buf, kValueBufSize, "%.16g", value);
- }
-};
-
-template
-struct ToString::value>::type> {
- static void Run(const T& value, char* buf) {
- snprintf(buf, kValueBufSize, "%lld", static_cast(value));
- }
-};
-
-template
-struct ToString {
- static void Run(T* value, char* buf) {
- snprintf(buf, kValueBufSize, "%p", value);
- }
-};
-
-template
-struct ToString::value>::type> {
- static void Run(const T& value, char* buf) {
- snprintf(buf, kValueBufSize, "(enum value %d)", static_cast(value));
- }
-};
-
-inline void Failure(const char* file, int line, const char* macro,
- const char* condition) {
- fprintf(stderr, "%s:%d: %s condition not satisfied: %s\n", file, line, macro,
- condition);
- abort();
-}
-
-template
-inline void Failure(const char* file, int line, const char* macro,
- const char* lhs, const LhsType& lhs_value, const char* op,
- const char* rhs, const RhsType& rhs_value) {
- char lhs_value_buf[kValueBufSize];
- ToString::Run(lhs_value, lhs_value_buf);
- char rhs_value_buf[kValueBufSize];
- ToString::Run(rhs_value, rhs_value_buf);
- fprintf(stderr,
- "%s:%d: %s condition not satisfied: [ %s %s %s ] with values [ "
- "%s %s %s ].\n",
- file, line, macro, lhs, op, rhs, lhs_value_buf, op, rhs_value_buf);
- abort();
-}
-
-#define RUY_CHECK_IMPL(macro, condition) \
- do { \
- if (!(condition)) { \
- ruy::check_macros::Failure(__FILE__, __LINE__, #macro, #condition); \
- } \
- } while (false)
-
-#define RUY_CHECK_OP_IMPL(macro, lhs, op, rhs) \
- do { \
- const auto& lhs_value = (lhs); \
- const auto& rhs_value = (rhs); \
- if (!(lhs_value op rhs_value)) { \
- ruy::check_macros::Failure(__FILE__, __LINE__, #macro, #lhs, lhs_value, \
- #op, #rhs, rhs_value); \
- } \
- } while (false)
-
-#define RUY_CHECK(condition) RUY_CHECK_IMPL(RUY_CHECK, condition)
-#define RUY_CHECK_EQ(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_EQ, x, ==, y)
-#define RUY_CHECK_NE(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_NE, x, !=, y)
-#define RUY_CHECK_GE(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_GE, x, >=, y)
-#define RUY_CHECK_GT(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_GT, x, >, y)
-#define RUY_CHECK_LE(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_LE, x, <=, y)
-#define RUY_CHECK_LT(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_LT, x, <, y)
-
-#ifdef NDEBUG
-#define RUY_DCHECK(condition)
-#define RUY_DCHECK_EQ(x, y)
-#define RUY_DCHECK_NE(x, y)
-#define RUY_DCHECK_GE(x, y)
-#define RUY_DCHECK_GT(x, y)
-#define RUY_DCHECK_LE(x, y)
-#define RUY_DCHECK_LT(x, y)
-#else
-#define RUY_DCHECK(condition) RUY_CHECK(condition)
-#define RUY_DCHECK_EQ(x, y) RUY_CHECK_EQ(x, y)
-#define RUY_DCHECK_NE(x, y) RUY_CHECK_NE(x, y)
-#define RUY_DCHECK_GE(x, y) RUY_CHECK_GE(x, y)
-#define RUY_DCHECK_GT(x, y) RUY_CHECK_GT(x, y)
-#define RUY_DCHECK_LE(x, y) RUY_CHECK_LE(x, y)
-#define RUY_DCHECK_LT(x, y) RUY_CHECK_LT(x, y)
-#endif
-
-} // end namespace check_macros
-} // end namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CHECK_MACROS_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/check_macros_test.cc b/tensorflow/lite/experimental/ruy/ruy/check_macros_test.cc
deleted file mode 100644
index 1a2a5a238f2..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/check_macros_test.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-
-#include
-
-namespace {
-
-#define TEST_CONDITION_FOR_FAMILY(family, vacuously_succeeds, condition) \
- do { \
- if (vacuously_succeeds || (condition)) { \
- RUY_##family(condition); \
- } \
- } while (false)
-
-#define TEST_COMPARISON_FOR_FAMILY(family, vacuously_succeeds, op_name, x, op, \
- y) \
- do { \
- if (vacuously_succeeds || ((x)op(y))) { \
- RUY_##family##_##op_name(x, y); \
- } \
- } while (false)
-
-#ifdef NDEBUG
-#define TEST_CONDITION(condition) \
- do { \
- TEST_CONDITION_FOR_FAMILY(CHECK, false, condition); \
- } while (false)
-#define TEST_COMPARISON(op_name, x, op, y) \
- do { \
- TEST_COMPARISON_FOR_FAMILY(CHECK, false, op_name, x, op, y); \
- } while (false)
-#else
-#define TEST_CONDITION(condition) \
- do { \
- TEST_CONDITION_FOR_FAMILY(CHECK, false, condition); \
- TEST_CONDITION_FOR_FAMILY(DCHECK, false, condition); \
- } while (false)
-#define TEST_COMPARISON(op_name, x, op, y) \
- do { \
- TEST_COMPARISON_FOR_FAMILY(CHECK, false, op_name, x, op, y); \
- TEST_COMPARISON_FOR_FAMILY(DCHECK, false, op_name, x, op, y); \
- } while (false)
-
-#endif
-
-template
-void TestEqualityComparisons(const LhsType& lhs, const RhsType& rhs) {
- RUY_CHECK_EQ(lhs, lhs);
- TEST_COMPARISON(EQ, lhs, ==, lhs);
- RUY_CHECK_EQ(lhs, lhs);
- RUY_CHECK_EQ(lhs, lhs);
- if (lhs == rhs) {
- RUY_CHECK_EQ(lhs, rhs);
- }
- if (lhs != rhs) {
- RUY_CHECK_NE(lhs, rhs);
- }
-}
-
-template
-void TestComparisons(const LhsType& lhs, const RhsType& rhs) {
- TestEqualityComparisons(lhs, rhs);
- if (lhs > rhs) {
- RUY_CHECK_GT(lhs, rhs);
- }
- if (lhs >= rhs) {
- RUY_CHECK_GE(lhs, rhs);
- }
- if (lhs < rhs) {
- RUY_CHECK_LT(lhs, rhs);
- }
- if (lhs <= rhs) {
- RUY_CHECK_LE(lhs, rhs);
- }
-}
-
-TEST(CheckMacrosTest, IntInt) {
- TestComparisons(0, 0);
- TestComparisons(0, 1);
- TestComparisons(1, -1);
- TestComparisons(-1, 0);
- TestComparisons(123, -456);
- TestComparisons(std::numeric_limits::min(),
- std::numeric_limits::max());
- TestComparisons(123, std::numeric_limits::max());
- TestComparisons(123, std::numeric_limits::min());
-}
-
-TEST(CheckMacrosTest, Uint8Uint8) {
- TestComparisons(0, 0);
- TestComparisons(255, 0);
- TestComparisons(0, 255);
- TestComparisons(12, 34);
-}
-
-TEST(CheckMacrosTest, Uint8Int) {
- TestComparisons(0, std::numeric_limits::min());
- TestComparisons(255, std::numeric_limits::min());
- TestComparisons(0, std::numeric_limits::max());
- TestComparisons(255, std::numeric_limits::max());
-}
-
-TEST(CheckMacrosTest, FloatFloat) {
- TestComparisons(0.f, 0.f);
- TestComparisons(0.f, 1.f);
- TestComparisons(1.f, -1.f);
- TestComparisons(-1.f, 0.f);
- TestComparisons(123.f, -456.f);
- TestComparisons(std::numeric_limits::lowest(),
- std::numeric_limits::max());
- TestComparisons(123.f, std::numeric_limits::max());
- TestComparisons(123.f, std::numeric_limits::lowest());
-}
-
-TEST(CheckMacrosTest, IntFloat) {
- TestComparisons(0, 0.f);
- TestComparisons(0, 1.f);
- TestComparisons(1, -1.f);
- TestComparisons(-1, 0.f);
- TestComparisons(123, -456.f);
- TestComparisons(std::numeric_limits::lowest(),
- std::numeric_limits::max());
- TestComparisons(123, std::numeric_limits::max());
- TestComparisons(123, std::numeric_limits::lowest());
-}
-
-TEST(CheckMacrosTest, EnumClass) {
- enum class SomeEnumClass { kA, kB, kC };
- TestEqualityComparisons(SomeEnumClass::kA, SomeEnumClass::kA);
- TestEqualityComparisons(SomeEnumClass::kA, SomeEnumClass::kB);
- TestEqualityComparisons(SomeEnumClass::kC, SomeEnumClass::kB);
-}
-
-} // namespace
-
-int main(int argc, char** argv) {
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/experimental/ruy/ruy/common.h b/tensorflow/lite/experimental/ruy/ruy/common.h
deleted file mode 100644
index e52a6ba6976..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/common.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Miscellaneous helpers internal library.
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_COMMON_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_COMMON_H_
-
-#include
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/matrix.h"
-#include "tensorflow/lite/experimental/ruy/ruy/opt_set.h"
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/platform.h"
-
-#if RUY_OPT_ENABLED(RUY_OPT_PREFETCH_LOAD)
-#define RUY_PREFETCH_LOAD(X) X
-#else
-#define RUY_PREFETCH_LOAD(X)
-#endif
-
-#if RUY_OPT_ENABLED(RUY_OPT_PREFETCH_STORE)
-#define RUY_PREFETCH_STORE(X) X
-#else
-#define RUY_PREFETCH_STORE(X)
-#endif
-
-#define RUY_STR(s) RUY_STR_UNEXPANDED(s)
-#define RUY_STR_UNEXPANDED(s) #s
-
-namespace ruy {
-
-// Helper for type-erasing a pointer.
-//
-// Often inside Ruy, a template parameter holds type information statically, but
-// we would like to have a function signature that doesn't depend on the
-// template parameters, so that we can dispatch indirectly across multiple
-// implementations. This helper is at the core of such type-erasure.
-//
-// The opposite of this operation is just `static_cast(void_ptr)`.
-template
-void* ToVoidPtr(T* p) {
- return const_cast(static_cast(p));
-}
-
-template
-Scalar SymmetricZeroPoint() {
- if (std::is_floating_point::value) {
- return 0;
- }
- if (std::is_signed::value) {
- return 0;
- }
- return std::numeric_limits::max() / 2 + 1;
-}
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_COMMON_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/context.cc b/tensorflow/lite/experimental/ruy/ruy/context.cc
deleted file mode 100644
index e0d4701645f..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/context.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/context.h"
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/detect_arm.h"
-#include "tensorflow/lite/experimental/ruy/ruy/detect_x86.h"
-#include "tensorflow/lite/experimental/ruy/ruy/have_built_path_for.h"
-#include "tensorflow/lite/experimental/ruy/ruy/platform.h"
-
-namespace ruy {
-
-void Context::SetRuntimeEnabledPaths(Path paths) {
- runtime_enabled_paths_ = paths;
-}
-
-Path Context::GetRuntimeEnabledPaths() {
- // This function should always return the same value on a given machine.
- // When runtime_enabled_paths_ has its initial value kNone, it performs
- // some platform detection to resolve it to specific Path values.
-
- // Fast path: already resolved.
- if (runtime_enabled_paths_ != Path::kNone) {
- return runtime_enabled_paths_;
- }
-
- // Need to resolve now. Start by considering all paths enabled.
- runtime_enabled_paths_ = kAllPaths;
-
- // This mechanism is intended to be used for testing and benchmarking. For
- // example, one can set RUY_FORCE_DISABLE_PATHS to Path::kAvx512 in order to
- // evaluate AVX2 performance on an AVX-512 machine.
-#ifdef RUY_FORCE_DISABLE_PATHS
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~(RUY_FORCE_DISABLE_PATHS);
-#endif
-
-#if RUY_PLATFORM(ARM)
- // Now selectively disable paths that aren't supported on this machine.
- if ((runtime_enabled_paths_ & Path::kNeonDotprod) != Path::kNone) {
- if (!DetectDotprod()) {
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kNeonDotprod;
- // Sanity check.
- RUY_DCHECK((runtime_enabled_paths_ & Path::kNeonDotprod) == Path::kNone);
- }
- }
-#endif // RUY_PLATFORM(ARM)
-
-#if RUY_PLATFORM(X86)
- // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
- // placeholder. Optimization is not finished. In particular the dimensions of
- // the kernel blocks can be changed as desired.
- //
- if ((runtime_enabled_paths_ & Path::kSse42) != Path::kNone) {
- if (!(HaveBuiltPathForSse42() && DetectCpuSse42())) {
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kSse42;
- // Sanity check.
- RUY_DCHECK((runtime_enabled_paths_ & Path::kSse42) == Path::kNone);
- }
- }
-
- if ((runtime_enabled_paths_ & Path::kAvx2) != Path::kNone) {
- if (!(HaveBuiltPathForAvx2() && DetectCpuAvx2())) {
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx2;
- // Sanity check.
- RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx2) == Path::kNone);
- }
- }
-
- if ((runtime_enabled_paths_ & Path::kAvx512) != Path::kNone) {
- if (!(HaveBuiltPathForAvx512() && DetectCpuAvx512())) {
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx512;
- // Sanity check.
- RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx512) == Path::kNone);
- }
- }
-
- // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
- // placeholder. Optimization is not finished. In particular the dimensions of
- // the kernel blocks can be changed as desired.
- //
- if ((runtime_enabled_paths_ & Path::kAvxVnni) != Path::kNone) {
- if (!(HaveBuiltPathForAvxVnni() && DetectCpuAvxVnni())) {
- runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvxVnni;
- // Sanity check.
- RUY_DCHECK((runtime_enabled_paths_ & Path::kAvxVnni) == Path::kNone);
- }
- }
-#endif // RUY_PLATFORM(X86)
-
- // Sanity check. We can't possibly have disabled all paths, as some paths
- // are universally available (kReference, kStandardCpp).
- RUY_DCHECK_NE(runtime_enabled_paths_, Path::kNone);
- return runtime_enabled_paths_;
-}
-
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/context.h b/tensorflow/lite/experimental/ruy/ruy/context.h
deleted file mode 100644
index a2d05a9ba5c..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/context.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CONTEXT_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CONTEXT_H_
-
-#include
-#include
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/allocator.h"
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/prepacked_cache.h"
-#include "tensorflow/lite/experimental/ruy/ruy/thread_pool.h"
-#include "tensorflow/lite/experimental/ruy/ruy/trace.h"
-#include "tensorflow/lite/experimental/ruy/ruy/tune.h"
-
-namespace ruy {
-
-// The state private to each Ruy thread.
-struct PerThreadState {
- // Each thread may be running on a different microarchitecture. For example,
- // some threads may be on big cores, while others are on little cores. Thus,
- // it's best for the tuning to be per-thread.
- TuningResolver tuning_resolver;
- // Each thread has its own local allocator.
- Allocator allocator;
-};
-
-// A Context holds runtime information used by Ruy. It holds runtime resources
-// such as the workers thread pool and the allocator (which holds buffers for
-// temporary data), as well as runtime options controlling which Paths are
-// enabled (typically based on which instruction sets are detected) and how
-// many threads to use.
-struct Context final {
- Path last_taken_path = Path::kNone;
- Tuning explicit_tuning = Tuning::kAuto;
- // TODO(benoitjacob) rename that thread_pool. Current name is gemmlowp legacy.
- ThreadPool workers_pool;
- int max_num_threads = 1;
- // State for each thread in the thread pool. Entry 0 is the main thread.
- std::vector> per_thread_states;
- TracingContext tracing;
- CachePolicy cache_policy = CachePolicy::kNoCache;
-
- Allocator* GetMainAllocator() {
- if (!main_allocator_) {
- main_allocator_.reset(new Allocator);
- }
- return main_allocator_.get();
- }
-
- PrepackedCache* GetPrepackedCache() {
- if (!prepacked_cache_) {
- prepacked_cache_.reset(new PrepackedCache);
- }
- return prepacked_cache_.get();
- }
-
- void ClearPrepackedCache() { prepacked_cache_ = nullptr; }
-
- void EnsureNPerThreadStates(int thread_count) {
- while (per_thread_states.size() < static_cast(thread_count)) {
- per_thread_states.emplace_back(new PerThreadState);
- }
- }
-
- Tuning GetMainThreadTuning() {
- EnsureNPerThreadStates(1);
- TuningResolver* tuning_resolver = &per_thread_states[0]->tuning_resolver;
- tuning_resolver->SetTuning(explicit_tuning);
- return tuning_resolver->Resolve();
- }
-
- template
- Path GetPathToTake() {
- last_taken_path =
- GetMostSignificantPath(CompiledPaths & GetRuntimeEnabledPaths());
- return last_taken_path;
- }
-
- void SetRuntimeEnabledPaths(Path paths);
- Path GetRuntimeEnabledPaths();
-
- private:
- // Allocator for main thread work before invoking the threadpool.
- // Our simple Allocator does not allow reserving/allocating more blocks
- // while it's already in committed state, so the main thread needs both
- // this allocator, and its per-thread allocator.
- std::unique_ptr main_allocator_;
- std::unique_ptr prepacked_cache_;
- Path runtime_enabled_paths_ = Path::kNone;
-};
-
-} // end namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CONTEXT_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/context_test.cc b/tensorflow/lite/experimental/ruy/ruy/context_test.cc
deleted file mode 100644
index bddbfcf8c55..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/context_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/context.h"
-
-#include
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/platform.h"
-
-namespace ruy {
-namespace {
-
-TEST(ContextTest, EnabledPathsGeneral) {
- ruy::Context ruy_context;
- const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
- const auto ruy_paths_repeat = ruy_context.GetRuntimeEnabledPaths();
- ASSERT_EQ(ruy_paths, ruy_paths_repeat);
- EXPECT_NE(ruy_paths, Path::kNone);
- EXPECT_EQ(ruy_paths & Path::kReference, Path::kReference);
- EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kStandardCpp);
-}
-
-#if RUY_PLATFORM(X86)
-TEST(ContextTest, EnabledPathsX86) {
- ruy::Context ruy_context;
- ruy_context.SetRuntimeEnabledPaths(Path::kSse42 | Path::kAvx2 |
- Path::kAvx512 | Path::kAvxVnni);
- const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
- EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
- EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
-}
-#endif // RUY_PLATFORM(X86)
-
-#if RUY_PLATFORM(ARM)
-TEST(ContextTest, EnabledPathsArm) {
- ruy::Context ruy_context;
- ruy_context.SetRuntimeEnabledPaths(Path::kNeon | Path::kNeonDotprod);
- const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
- EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
- EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
- EXPECT_EQ(ruy_paths & Path::kNeon, Path::kNeon);
-}
-#endif // RUY_PLATFORM(ARM)
-
-} // namespace
-} // namespace ruy
-
-int main(int argc, char** argv) {
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/experimental/ruy/ruy/cpu_cache_size.h b/tensorflow/lite/experimental/ruy/ruy/cpu_cache_size.h
deleted file mode 100644
index 95ed35ec097..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/cpu_cache_size.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2020 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_
-
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/platform.h"
-
-namespace ruy {
-
-// LocalDataCacheSize returns a sane default size for each CPU core's local
-// data cache, i.e. the largest data cache that is local to that CPU core, not
-// shared with other cores. That allows coarse tuning of code that aims for
-// most of its memory accesses to hit such a typically fast data cache.
-//
-// SharedDataCacheSize returns a sane default size of the total data cache
-// accessible to each CPU, including any shared cache.
-//
-// For example, if we design tune this code for a ARM Cortex-A55 with a local L1
-// cache of 32k, a local L2 cache of 128k and a shared L3 cache of 1M,
-// LocalDataCacheSize should return 128k and SharedDataCacheSize
-// should return 1M.
-//
-// Ideally these values would be queried at runtime, and we should probably
-// do that on x86, but that is hard to do on ARM.
-#if RUY_PLATFORM(ARM_64)
-inline int LocalDataCacheSize() { return 1 << 15; }
-inline int SharedDataCacheSize() { return 1 << 19; }
-#elif RUY_PLATFORM(ARM_32)
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#elif RUY_PLATFORM(X86)
-inline int LocalDataCacheSize() { return 1 << 17; }
-inline int SharedDataCacheSize() { return 1 << 21; }
-#else
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#endif
-// Variants taking a Path argument which acts
-// as a hint telling whether we're targeting more or less recent/powerful CPUs.
-inline int LocalDataCacheSize(Path path) {
-#if RUY_PLATFORM(ARM_64)
- if (path == Path::kNeonDotprod) {
- // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
- // 128k L2 local cache.
- return 1 << 17;
- }
-#else
- (void)path;
-#endif
- return LocalDataCacheSize();
-}
-inline int SharedDataCacheSize(Path path) {
-#if RUY_PLATFORM(ARM_64)
- if (path == Path::kNeonDotprod) {
- // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
- // 1M L3 shared cache.
- return 1 << 20;
- }
-#else
- (void)path;
-#endif
- return SharedDataCacheSize();
-}
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/detect_arm.cc b/tensorflow/lite/experimental/ruy/ruy/detect_arm.cc
deleted file mode 100644
index 8f6d2c9f9fe..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/detect_arm.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/* Detection of dotprod instructions on ARM.
- * The current Linux-specific code relies on sufficiently new Linux kernels:
- * At least Linux 4.15 in general; on Android, at least Linux 4.14.111 thanks to
- * a late backport. This was backported just before the Android 10 release, so
- * this is leaving out pre-release Android 10 builds as well as earlier Android
- * versions.
- *
- * It is possible to detect instructions in other ways that don't rely on
- * an OS-provided feature identification mechanism:
- *
- * (A) We used to have a SIGILL-handler-based method that worked at least
- * on Linux. Its downsides were (1) crashes on a few devices where
- * signal handler installation didn't work as intended; (2) additional
- * complexity to generalize to other Unix-ish operating systems including
- * iOS; (3) source code complexity and fragility of anything installing
- * and restoring signal handlers; (4) confusing behavior under a debugger.
- *
- * (B) We also experimented with a fork-ing approach where a subprocess
- * tries the instruction. Compared to (A), this is much simpler and more
- * reliable and portable, but also much higher latency on Android where
- * an uncaught signal typically causes a 100 ms latency.
- *
- * Should there be interest in either technique again in the future,
- * code implementing both (A) and (B) can be found in earlier revisions of this
- * file - in actual code for (A) and in a comment for (B).
- */
-
-#include "tensorflow/lite/experimental/ruy/ruy/detect_arm.h"
-
-#if defined __linux__ && defined __aarch64__
-#include
-#endif
-
-namespace ruy {
-
-namespace {
-
-#if defined __linux__ && defined __aarch64__
-bool DetectDotprodByLinuxAuxvMethod() {
- // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
- // however we need to support building against older headers for the time
- // being.
- const int kLocalHwcapAsimddp = 1 << 20;
- return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
-}
-#endif
-
-} // namespace
-
-bool DetectDotprod() {
-#if defined __linux__ && defined __aarch64__
- return DetectDotprodByLinuxAuxvMethod();
-#endif
-
- return false;
-}
-
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/detect_arm.h b/tensorflow/lite/experimental/ruy/ruy/detect_arm.h
deleted file mode 100644
index 9a1542d3cce..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/detect_arm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Temporary dotprod-detection code until we can rely on getauxval.
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_ARM_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_ARM_H_
-
-namespace ruy {
-
-// On A64, returns true if the dotprod extension is present.
-// On other architectures, returns false unconditionally.
-bool DetectDotprod();
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_ARM_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/detect_x86.cc b/tensorflow/lite/experimental/ruy/ruy/detect_x86.cc
deleted file mode 100644
index 113a73c09e3..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/detect_x86.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/ruy/ruy/detect_x86.h"
-
-#include
-
-#if RUY_PLATFORM(X86) && RUY_PLATFORM(X86_ENHANCEMENTS)
-#include // IWYU pragma: keep
-
-#endif
-
-namespace ruy {
-#if RUY_PLATFORM(X86) && RUY_PLATFORM(X86_ENHANCEMENTS)
-
-namespace {
-
-// See Intel docs, such as http://goo.gl/c6IkGX.
-inline void RunCpuid(std::uint32_t eax, std::uint32_t ecx,
- std::uint32_t abcd[4]) {
- std::uint32_t ebx, edx;
-#if defined(__i386__) && defined(__PIC__)
- /* in case of PIC under 32-bit EBX cannot be clobbered */
- asm volatile("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
- : "=D"(ebx),
-#else
- asm volatile("cpuid"
- : "+b"(ebx),
-#endif
- "+a"(eax), "+c"(ecx), "=d"(edx));
- abcd[0] = eax;
- abcd[1] = ebx;
- abcd[2] = ecx;
- abcd[3] = edx;
-}
-
-} // namespace
-
-bool DetectCpuSse42() {
- std::uint32_t abcd[4];
-
- constexpr std::uint32_t kEcxSse42 = 1u << 20;
- RunCpuid(1, 0, abcd);
- const bool has_sse4_2_base = (abcd[2] & kEcxSse42) == kEcxSse42;
-
-#ifdef RUY_ENABLE_AMD_CPUID_CHECKS
- constexpr std::uint32_t kEcxAbm = 1u << 5;
- RunCpuid(0x80000001, 0, abcd);
- const bool has_extras = (abcd[2] & kEcxAbm) == kEcxAbm;
-#else
- constexpr std::uint32_t kEcxPopcnt = 1u << 23;
- RunCpuid(1, 0, abcd);
- const bool has_extras = (abcd[2] & kEcxPopcnt) == kEcxPopcnt;
-#endif
-
- return has_sse4_2_base && has_extras;
-}
-
-bool DetectCpuAvx2() {
- constexpr std::uint32_t kEbxAvx2 = 1u << 5;
- constexpr std::uint32_t kEcxFma = 1u << 12;
-
- std::uint32_t abcd[4];
-
- RunCpuid(7, 0, abcd);
- const bool has_avx2 = (abcd[1] & kEbxAvx2) == kEbxAvx2;
- RunCpuid(1, 0, abcd);
- const bool has_fma = (abcd[2] & kEcxFma) == kEcxFma;
-
- return has_avx2 && has_fma;
-}
-
-bool DetectCpuAvx512() {
- constexpr std::uint32_t kEbxAvx512F = 1u << 16;
- constexpr std::uint32_t kEbxAvx512Dq = 1u << 17;
- constexpr std::uint32_t kEbxAvx512Cd = 1u << 28;
- constexpr std::uint32_t kEbxAvx512Bw = 1u << 30;
- constexpr std::uint32_t kEbxAvx512Vl = 1u << 31;
-
- constexpr std::uint32_t kEbxAvx512Mask =
- kEbxAvx512F | kEbxAvx512Dq | kEbxAvx512Cd | kEbxAvx512Bw | kEbxAvx512Vl;
- std::uint32_t abcd[4];
- RunCpuid(7, 0, abcd);
-
- return (abcd[1] & kEbxAvx512Mask) == kEbxAvx512Mask;
-}
-
-#endif
-} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/ruy/detect_x86.h b/tensorflow/lite/experimental/ruy/ruy/detect_x86.h
deleted file mode 100644
index 185dabe06a5..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/detect_x86.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_X86_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_X86_H_
-
-#include "tensorflow/lite/experimental/ruy/ruy/platform.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM(X86)
-#if RUY_PLATFORM(X86_ENHANCEMENTS)
-
-// This also checks ABM support, which implies LZCNT and POPCNT.
-bool DetectCpuSse42();
-bool DetectCpuAvx2();
-bool DetectCpuAvx512();
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// TODO(b/146646451): Introduce and activate.
-inline bool DetectCpuAvxVnni() { return false; }
-
-#else // RUY_PLATFORM(X86_ENHANCEMENTS)
-
-inline bool DetectCpuSse42() { return false; }
-inline bool DetectCpuAvx2() { return false; }
-inline bool DetectCpuAvx512() { return false; }
-inline bool DetectCpuAvxVnni() { return false; }
-
-#endif // !RUY_PLATFORM(X86_ENHANCEMENTS)
-#endif // RUY_PLATFORM(X86)
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DETECT_X86_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/ruy/dispatch.h
deleted file mode 100644
index d1e97e29b9c..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/dispatch.h
+++ /dev/null
@@ -1,482 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This file implements the translation between Ruy's entry point (ruy::Mul) and
-// the internal implementation of matrix multiplication.
-//
-// The primary elements of this dispatch are:
-// - pick suitable gemm kernel and packing routines for the user-specified
-// CompiledPaths based on the current CPU.
-// - decide on the structure of the packed matrices needed by the internal
-// implementation (see pack.h for more information on packing).
-// - translate the Mul operation into TrMul (see trmul.h for why that is
-// useful). This is done by changing the matrix Layout -- no matrix data is
-// actually moved.
-//
-// This file is also factored to serve as a building block for the advanced API
-// as well.
-//
-// This file also performs some checking of invariants to catch user errors.
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DISPATCH_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DISPATCH_H_
-
-#include
-#include
-#include // IWYU pragma: keep
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/check_macros.h"
-#include "tensorflow/lite/experimental/ruy/ruy/common.h"
-#include "tensorflow/lite/experimental/ruy/ruy/context.h"
-#include "tensorflow/lite/experimental/ruy/ruy/internal_matrix.h"
-#include "tensorflow/lite/experimental/ruy/ruy/kernel.h"
-#include "tensorflow/lite/experimental/ruy/ruy/kernel_common.h"
-#include "tensorflow/lite/experimental/ruy/ruy/matrix.h"
-#include "tensorflow/lite/experimental/ruy/ruy/opt_set.h"
-#include "tensorflow/lite/experimental/ruy/ruy/pack.h"
-#include "tensorflow/lite/experimental/ruy/ruy/pack_common.h"
-#include "tensorflow/lite/experimental/ruy/ruy/path.h"
-#include "tensorflow/lite/experimental/ruy/ruy/profiler/instrumentation.h"
-#include "tensorflow/lite/experimental/ruy/ruy/side_pair.h"
-#include "tensorflow/lite/experimental/ruy/ruy/size_util.h"
-#include "tensorflow/lite/experimental/ruy/ruy/spec.h"
-#include "tensorflow/lite/experimental/ruy/ruy/trmul.h"
-#include "tensorflow/lite/experimental/ruy/ruy/trmul_params.h"
-
-namespace ruy {
-
-// If the Spec's LayoutSupport covers only some special cases,
-// this function enforces that the matrix multiplication at hand falls into
-// that special case.
-template
-void EnforceLayoutSupport(const Layout& lhs_layout, const Layout& rhs_layout,
- const Layout& dst_layout) {
- if (Spec::kLayoutSupport == LayoutSupport::kRCC) {
- RUY_DCHECK(IsRowMajor(lhs_layout));
- RUY_DCHECK(IsColMajor(rhs_layout));
- RUY_DCHECK(IsColMajor(dst_layout));
- }
-}
-
-template
-bool IsSymmetricZeroPoint(Scalar zero_point) {
- return zero_point == SymmetricZeroPoint();
-}
-
-template
-void CheckZeroPoint(Scalar zero_point) {
- if (std::is_floating_point::value ||
- Spec::kZeroPointSupport == ZeroPointSupport::kSymmetric) {
- RUY_DCHECK(IsSymmetricZeroPoint(zero_point));
- }
-}
-
-template
-void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point,
- DstScalar dst_zero_point) {
- // If the Spec's ZeroPointSupport covers only some special cases,
- // this function enforces that the matrix multiplication at hand falls into
- // that special case.
- CheckZeroPoint(lhs_zero_point);
- CheckZeroPoint(rhs_zero_point);
- CheckZeroPoint(dst_zero_point);
-
- // Guard against the case when both LHS and RHS zero_point's are equal to
- // the minimum representable value. In that case, padding with zero_point
- // values will generate the bad case for fast int8 kernels on NEON
- // (pre-dotprod) which attempt to multiply-accumulate two pairs of int8
- // into a int16: this is safe except in the bad case -128*-128 + -128*-128.
- // See b/131609283. This only affects the kNeon path but we ban this for all
- // paths in order for ruy to have the same supported parameter space
- // on all paths.
- RUY_DCHECK(lhs_zero_point != std::numeric_limits::lowest() ||
- rhs_zero_point != std::numeric_limits::lowest());
-}
-
-template
-void EnforceDstSpecSupport(const Spec& spec, DstScalar dst_zero_point) {
- static_assert(std::is_same::value, "");
- if (!std::is_same::value) return;
-
- // If user is looking for the raw accumulator, zero_point and all the other
- // dequantize fields don't make sense and should not be set.
- RUY_DCHECK_EQ(dst_zero_point, 0);
- RUY_DCHECK_EQ(spec.clamp_max, std::numeric_limits::max());
- RUY_DCHECK_EQ(spec.clamp_min, std::numeric_limits::min());
- RUY_DCHECK_EQ(spec.multiplier_fixedpoint, 0);
- RUY_DCHECK_EQ(spec.multiplier_exponent, 0);
- RUY_DCHECK_EQ(spec.multiplier_fixedpoint_perchannel, nullptr);
- RUY_DCHECK_EQ(spec.multiplier_exponent_perchannel, nullptr);
-}
-
-inline bool IsColMajorTrMul(const TrMulParams& params) {
- return IsColMajor(params.src[Side::kLhs].layout) &&
- IsColMajor(params.src[Side::kRhs].layout) &&
- IsColMajor(params.dst.layout);
-}
-
-inline void CreatePackedLayout(const Layout& src, const Type& scalar,
- const KernelLayout& kernel_layout,
- PackedLayout* packed) {
- packed->order = Order::kColMajor;
- packed->rows = round_up_pot(src.rows, kernel_layout.rows);
- packed->cols = round_up_pot(src.cols, kernel_layout.cols);
- packed->kernel = kernel_layout;
- int inner_size = packed->rows;
- if (RUY_OPT_ENABLED(RUY_OPT_AVOID_ALIASING)) {
- packed->stride =
- (inner_size * scalar.size) % 1024 ? inner_size : inner_size + 64;
- } else {
- packed->stride = inner_size;
- }
-}
-
-template
-void CreatePackedMatrix(Side side, const KernelLayout& kernel_layout,
- TrMulParams* params) {
- // Ruy always uses 32-bit signed accumulators for quantized
- // matrix multiplication, so we would like to always use std::int32_t
- // unconditionally for SumsType.
- // However, for floating point types, we still need a reasonable type here to
- // avoid tripping assertions elsewhere in the code.
- using SumsType =
- typename std::conditional::value, Scalar,
- std::int32_t>::type;
-
- const DMatrix& src = params->src[side];
- PMatrix* packed = ¶ms->packed[side];
- packed->data_type = Type::Create();
- packed->sums_type = Type::Create();
- CreatePackedLayout(src.layout, packed->data_type, kernel_layout,
- &packed->layout);
- packed->zero_point = Pack(src.zero_point);
-}
-
-template
-void PopulateTrMulParams(TrMulParams* params) {
- static_assert((ThePath & Path::kReference) == Path::kNone,
- "Path::kReference should not do TrMul");
- // The optimized code paths don't handle the full generality of Ruy's API.
- // Fall back to Path::kStandardCpp if necessary.
- bool fallback_to_standard_cpp = false;
- if (ThePath != Path::kStandardCpp) {
- // The optimized code paths currently only handle the case of all matrices
- // being column major.
- if (!IsColMajorTrMul(*params)) {
- fallback_to_standard_cpp = true;
- }
- }
-
- if (fallback_to_standard_cpp) {
- PopulateTrMulParams(params);
- return;
- }
-
- using PackedLhsScalar = PackedType;
- using PackedRhsScalar = PackedType;
- using Kernel =
- Kernel;
- using LhsKernelLayout = typename Kernel::LhsLayout;
- using RhsKernelLayout = typename Kernel::RhsLayout;
-
- params->path = ThePath;
-
- params->local_data_cache_size = Spec::local_data_cache_size();
- params->shared_data_cache_size = Spec::shared_data_cache_size();
-
- CreatePackedMatrix(
- Side::kLhs, ToKernelLayout(), params);
- CreatePackedMatrix(
- Side::kRhs, ToKernelLayout(), params);
- params->run_pack[Side::kLhs] =
- &RunPack;
- params->run_pack[Side::kRhs] =
- &RunPack;
- params->run_kernel =
- &RunKernel;
-
- return;
-}
-
-// PopulateTrMulParamsAllCompiledPaths calls into one of multiple
-// instantiations of PopulateTrMulParams. For each bit that is set in
-// CompiledPaths, it statically instantiates PopulateTrMulParams with a Path
-// corresponding to that single bit. The call to PopulateTrMulParams is
-// guarded by a runtime check that it is in fact the dynamically selected path.
-//
-// PopulateTrMulParamsAllCompiledPaths is implemented with template
-// metaprogramming by mutual recursion between PathSearchCountdown and
-// PathSearchCompiledPaths.
-//
-// PopulateTrMulParamsAllCompiledPaths is logically implementing the following
-// computation:
-//
-// template
-// void PopulateTrMulParamsAllCompiledPaths(Path the_path,
-// TrMulParams* params) {
-// for (int bit = 8 * sizeof(Path) - 1; bit != -1; bit--) { // [1]
-// Path current_path = static_cast(1 << bit);
-// if ((CompiledPaths & current_path) != Path::kNone) { // [2]
-// if (current_path == the_path) { // [3]
-// PopulateTrMulParams(the_path, params);
-// return;
-// }
-// }
-// }
-// }
-//
-//
-//
-// [1] - Done by the main definition of PathSearchCountdown. The `bit--` is
-// done in the recursion of PathSearchOnlyCompiledPaths.
-// [2] - Done by PathSearchOnlyCompiledPaths's partial template
-// specialization on InCompiledPaths. This is the check which necessitates
-// doing the whole computation at C++ compile time.
-// [3] - Done by the `if` in the main definition of
-// PathSearchOnlyCompiledPaths.
-//
-// The template metaprogramming is necessary because:
-// - In `PopulateTrMulParams`, current_path must be a C++
-// compile-time constant.
-// - PopulateTrMulParamsAllCompiledPaths must not instantiate
-// inner loops for paths that are not in CompiledPaths, since that can result in
-// bogus instantiations which cause a compile time failure.
-template
-struct PathSearchCountdown;
-
-template
-struct PathSearchOnlyCompiledPaths {
- static constexpr Path kCurrentPath = static_cast(1 << BitNumber);
- static void Search(Path the_path, TrMulParams* params) {
- if (kCurrentPath == the_path) {
- PopulateTrMulParams(
- params);
- return;
- }
- PathSearchCountdown::Search(the_path, params);
- }
-};
-
-// Skip this iteration if CompiledPaths doesn't contain the specified path.
-template
-struct PathSearchOnlyCompiledPaths {
- static void Search(Path the_path, TrMulParams* params) {
- PathSearchCountdown::Search(the_path, params);
- }
-};
-
-template
-struct PathSearchCountdown {
- static constexpr Path kCurrentPath = static_cast(1 << BitNumber);
- static void Search(Path the_path, TrMulParams* params) {
- PathSearchOnlyCompiledPaths<
- CompiledPaths, (CompiledPaths & kCurrentPath) != Path::kNone, BitNumber,
- LhsScalar, RhsScalar, DstScalar, Spec>::Search(the_path, params);
- }
-};
-
-// Termination of the countdown. If the counter reaches -1, then we haven't
-// found the specified path.
-template
-struct PathSearchCountdown {
- static void Search(Path the_path, TrMulParams* params) { RUY_DCHECK(false); }
-};
-
-template
-void PopulateTrMulParamsAllCompiledPaths(Path the_path, TrMulParams* params) {
- return PathSearchCountdown::Search(the_path,
- params);
-}
-
-template
-void CreateTrMulParams(const Matrix& lhs,
- const Matrix& rhs, const Spec& spec,
- Context* context, Matrix* dst, Path the_path,
- TrMulParams* params) {
- // Fill in the fields we already know.
- params->src[Side::kLhs] = ToDMatrix(lhs);
- params->src[Side::kRhs] = ToDMatrix(rhs);
- params->dst = ToDMatrix(*dst);
- params->spec = ToVoidPtr(&spec);
-
- // Create inner loops and packed matrices based on the Path.
- PopulateTrMulParamsAllCompiledPaths(the_path, params);
-}
-
-template
-void ReferenceMul(const Matrix& lhs, const Matrix& rhs,
- const Spec& spec, Matrix* dst) {
- profiler::ScopeLabel label("ReferenceMul");
- for (int i = 0; i < lhs.layout.rows; i++) {
- for (int j = 0; j < rhs.layout.cols; j++) {
- using AccumScalar = typename Spec::AccumScalar;
- AccumScalar accum = 0;
- for (int k = 0; k < lhs.layout.cols; k++) {
- AccumScalar lhs_val = Element(lhs, i, k);
- AccumScalar rhs_val = Element(rhs, k, j);
- accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point);
- }
- if (spec.bias) {
- accum += spec.bias[i];
- }
- ApplyMultiplier(spec, i, &accum);
- accum += dst->zero_point;
- accum = std::min(accum, spec.clamp_max);
- accum = std::max(accum, spec.clamp_min);
- *ElementPtr(dst, i, j) = static_cast(accum);
- }
- }
-}
-
-// Compile-time dispatch to ReferenceMul. This allows us to statically ensure
-// that there is no call to ReferenceMul in the user's binary.
-template
-struct CompileTimeEnabledReferenceMul {
- template
- static void Run(const Matrix& lhs, const Matrix& rhs,
- const Spec& spec, Matrix* dst) {
- ReferenceMul(lhs, rhs, spec, dst);
- }
-};
-
-// When this partial specialization is chosen, it ensures that ReferenceMul
-// is never compiled.
-template <>
-struct CompileTimeEnabledReferenceMul*ReferenceMulIsEnabled=*/false> {
- template
- static void Run(const Matrix& lhs, const Matrix& rhs,
- const Spec& spec, Matrix* dst) {
- RUY_DCHECK(false);
- }
-};
-
-inline void HandlePrepackedCaching(TrMulParams* params,
- const SidePair& cacheable,
- Context* context) {
- if (context->cache_policy == CachePolicy::kNoCache) {
- return;
- }
-
- if (context->cache_policy == CachePolicy::kCacheLHSOnNarrowMul) {
- // TODO(b/149304278) Cache on dst.cols <= selected kernel width.
- if (!cacheable[Side::kLhs] || params->dst.layout.cols > 4) {
- return;
- }
- PrepackedCache* prepacked_cache = context->GetPrepackedCache();
- auto cache_key = std::make_pair(reinterpret_cast(params->run_kernel),
- params->src[Side::kLhs].data);
- auto it = prepacked_cache->FindAndUpdate(cache_key);
- if (it != prepacked_cache->cend()) {
- params->packed[Side::kLhs].data = it->second.first.data;
- params->packed[Side::kLhs].sums = it->second.first.sums;
- params->is_prepacked[Side::kLhs] = true;
- return;
- }
-
- // Allocate the prepacked matrix.
- PrepackedMatrix prepacked_lhs;
- prepacked_lhs.data_size = DataSize(params->packed[Side::kLhs]);
- prepacked_lhs.sums_size = SumsSize(params->packed[Side::kLhs]);
- prepacked_cache->AllocatePrepackedMatrix(&prepacked_lhs);
- params->packed[Side::kLhs].data = prepacked_lhs.data;
- params->packed[Side::kLhs].sums = prepacked_lhs.sums;
- params->is_prepacked[Side::kLhs] = true;
- Tuning tuning = context->GetMainThreadTuning();
- params->RunPack(Side::kLhs, tuning, 0,
- params->packed[Side::kLhs].layout.cols);
- prepacked_cache->Insert(cache_key, prepacked_lhs);
- return;
- }
-}
-
-template
-void DispatchMul(const Matrix& lhs, const Matrix& rhs,
- const Spec& spec, Context* context, Matrix* dst) {
- static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path");
- static_assert((CompiledPaths & ~kAllPaths) == Path::kNone,
- "CompiledPaths must be a subset of ruy::kAllPaths");
-
- profiler::ScopeLabel mul_label("Mul");
- profiler::ScopeLabel shape_specific_label("matmul shape: %dx%dx%d",
- lhs.layout.rows, lhs.layout.cols,
- rhs.layout.cols);
-
- EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout);
- EnforceZeroPointSupport(lhs.zero_point, rhs.zero_point,
- dst->zero_point);
- EnforceDstSpecSupport(spec, dst->zero_point);
-
- // This should be a constant, for a given machine and CompiledPaths.
- // There is a back door to override it for testing, but in production it will
- // always be the "best" Path. I.e. the one with the newest SIMD instructions
- // available on the present machine, and avoiding Path::kReference unless
- // no other path is compiled.
- //
- // Unfortunately, it is not a *static* constant, since it depends on runtime
- // detection of the available SIMD instructions.
- Path the_path = context->GetPathToTake();
-
- // Production code should probably never execute Path::kReference.
- // Path::kReference implements a Mul, not a TrMul like the rest of Ruy, so if
- // that's what we need to do, then get it out of the way before going down the
- // TrMul path.
- if (the_path == Path::kReference) {
- constexpr bool ReferenceMulIsEnabled =
- (CompiledPaths & Path::kReference) != Path::kNone;
- CompileTimeEnabledReferenceMul::Run(lhs, rhs, spec,
- dst);
- return;
- }
-
- // As described in the comment at the top of this file, Ruy internally
- // converts Mul into TrMul. We handle that here.
- //
- // This is Ruy's main code path.
- constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference;
- Matrix transposed_lhs(lhs);
- Transpose(&transposed_lhs);
- TrMulParams params;
- CreateTrMulParams(transposed_lhs, rhs, spec, context, dst,
- the_path, ¶ms);
- SidePair cacheable(lhs.cacheable, rhs.cacheable);
- HandlePrepackedCaching(¶ms, cacheable, context);
- TrMul(¶ms, context);
-}
-
-} // namespace ruy
-
-#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_DISPATCH_H_
diff --git a/tensorflow/lite/experimental/ruy/ruy/example.cc b/tensorflow/lite/experimental/ruy/ruy/example.cc
deleted file mode 100644
index 5d31d6c2e3e..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/example.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/ruy.h"
-
-void ExampleMulFloat(ruy::Context *context) {
- const float lhs_data[] = {1, 2, 3, 4};
- const float rhs_data[] = {1, 2, 3, 4};
- float dst_data[4];
-
- ruy::Matrix lhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout);
- lhs.data = lhs_data;
- ruy::Matrix rhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout);
- rhs.data = rhs_data;
- ruy::Matrix dst;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout);
- dst.data = dst_data;
-
- ruy::BasicSpec spec;
- ruy::Mul(lhs, rhs, spec, context, &dst);
-
- std::cout << "Example Mul, float:\n";
- std::cout << "LHS:\n" << lhs;
- std::cout << "RHS:\n" << rhs;
- std::cout << "Result:\n" << dst << "\n";
-}
-
-void ExampleMulFloatWithBiasAddAndClamp(ruy::Context *context) {
- const float lhs_data[] = {1, 2, 3, 4};
- const float rhs_data[] = {1, 2, 3, 4};
- const float bias_data[] = {1, 0};
- float dst_data[4];
-
- ruy::Matrix lhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout);
- lhs.data = lhs_data;
- ruy::Matrix rhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout);
- rhs.data = rhs_data;
- ruy::Matrix dst;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout);
- dst.data = dst_data;
-
- ruy::BasicSpec spec;
- spec.bias = bias_data;
- spec.clamp_min = 0;
- spec.clamp_max = 15;
- ruy::Mul(lhs, rhs, spec, context, &dst);
-
- std::cout << "Example Mul, float with bias addition and clamp:\n";
- std::cout << "LHS:\n" << lhs;
- std::cout << "RHS:\n" << rhs;
- std::cout << "Result:\n" << dst << "\n";
-}
-
-void ExampleMulUint8AsymmetricQuantized(ruy::Context *context) {
- const std::uint8_t lhs_data[] = {124, 125, 126, 127};
- const std::uint8_t rhs_data[] = {129, 130, 131, 132};
- std::uint8_t dst_data[4];
-
- ruy::Matrix lhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout);
- lhs.data = lhs_data;
- lhs.zero_point = 125;
- ruy::Matrix rhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout);
- rhs.data = rhs_data;
- rhs.zero_point = 132;
- ruy::Matrix dst;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout);
- dst.data = dst_data;
- dst.zero_point = 129;
-
- ruy::BasicSpec spec;
- spec.multiplier_fixedpoint = 1 << 30;
-
- spec.multiplier_exponent = 0;
- ruy::Mul(lhs, rhs, spec, context, &dst);
-
- std::cout << "Example Mul, uint8 quantized with asymmetric zero points:\n";
- std::cout << "LHS:\n" << lhs;
- std::cout << "RHS:\n" << rhs;
- std::cout << "Result:\n" << dst << "\n";
-}
-void ExampleMulInt8PerChannelQuantized(ruy::Context *context) {
- const std::int8_t lhs_data[] = {1, 2, 3, 4};
- const std::int8_t rhs_data[] = {1, 2, 3, 4};
- const std::int32_t multiplier_data[] = {3 << 28, 5 << 28};
- const int exponent_data[] = {1, -2};
- std::int8_t dst_data[4];
-
- ruy::Matrix lhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout);
- lhs.data = lhs_data;
- ruy::Matrix rhs;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout);
- rhs.data = rhs_data;
- ruy::Matrix dst;
- ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout);
- dst.data = dst_data;
-
- ruy::BasicSpec spec;
- spec.multiplier_fixedpoint_perchannel = multiplier_data;
- spec.multiplier_exponent_perchannel = exponent_data;
- ruy::Mul(lhs, rhs, spec, context, &dst);
-
- std::cout << "Example Mul, int8 quantized with per-channel multipliers\n";
- std::cout << "LHS:\n" << lhs;
- std::cout << "RHS:\n" << rhs;
- std::cout << "Result:\n" << dst << "\n";
-}
-
-int main() {
- ruy::Context context;
- ExampleMulFloat(&context);
- ExampleMulFloatWithBiasAddAndClamp(&context);
- ExampleMulUint8AsymmetricQuantized(&context);
- ExampleMulInt8PerChannelQuantized(&context);
-}
diff --git a/tensorflow/lite/experimental/ruy/ruy/example_advanced.cc b/tensorflow/lite/experimental/ruy/ruy/example_advanced.cc
deleted file mode 100644
index 9e1dd17f86d..00000000000
--- a/tensorflow/lite/experimental/ruy/ruy/example_advanced.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include
-#include
-#include
-#include
-
-#include "tensorflow/lite/experimental/ruy/ruy/ruy_advanced.h"
-
-// Simple allocator for allocating pre-packed matrices.
-class SimpleAllocator {
- public:
- void* AllocateBytes(std::size_t num_bytes) {
- char* p = new char[num_bytes];
- buffers_.emplace_back(p);
- return static_cast(p);
- }
-
- private:
- std::vector> buffers_;
-};
-
-void ExamplePrepack(ruy::Context* context) {
- const float lhs_data[] = {1, 2, 3, 4};
- const float rhs_data[] = {1, 2, 3, 4};
- float dst_data[4];
-
- // Set up the matrix layouts and spec.
- ruy::Matrix