Specify -O3 and, on ARM32, -mfpu=neon as rule copts, for all our binary rules.

See the comment above RUY_COPTS in the BUILD file.

PiperOrigin-RevId: 262564280
This commit is contained in:
Benoit Jacob 2019-08-09 08:27:22 -07:00 committed by TensorFlower Gardener
parent a79c52ed09
commit 6c526e012c
2 changed files with 57 additions and 6 deletions

View File

@ -7,6 +7,25 @@ load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
load(":ruy_test.bzl", "ruy_benchmark", "ruy_benchmark_opt_sets", "ruy_test")
load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
# ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
# 2. Explicitly pass -O3 on mobile configs where just "-c opt" means "optimize for code size".
# We would want to only do that when compilation_mode is "opt", but limitations of
# the "select" keyword (no nested selects, no AND boolean) seem to make that difficult
# at the moment. For debugging purposes, this can be overridden on the command line, e.g.
# bazel build -c dbg --copt=-O0 ...
# Compiler options selected per build configuration; the rules in this file
# consume it through `copts = RUY_COPTS`.
RUY_COPTS = select({
# Android ARM64 config: only the optimization level is raised.
"//tensorflow:android_arm64": [
"-O3",
],
# Android ARM (32-bit) config: raise the optimization level and enable NEON.
"//tensorflow:android_arm": [
"-O3",
"-mfpu=neon",
],
# Any other configuration: no extra compiler options are added.
"//conditions:default": [
],
})
package(
default_visibility = ["//visibility:private"],
licenses = ["notice"], # Apache 2.0
@ -15,28 +34,33 @@ package(
# Header-only library exposing platform.h.
# NOTE(review): targets with only hdrs (like this one) have no sources of their
# own to compile, so copts presumably has no effect on them — confirm.
cc_library(
name = "platform",
hdrs = ["platform.h"],
copts = RUY_COPTS,
)
# Header-only library exposing check_macros.h; depends on the TFLite
# kernels/internal compatibility target.
cc_library(
name = "check_macros",
hdrs = ["check_macros.h"],
copts = RUY_COPTS,
deps = ["//tensorflow/lite/kernels/internal:compatibility"],
)
# Header-only library exposing opt_set.h; it has no dependencies.
cc_library(
name = "opt_set",
hdrs = ["opt_set.h"],
copts = RUY_COPTS,
)
# Header-only library exposing time.h; it has no dependencies.
cc_library(
name = "time",
hdrs = ["time.h"],
copts = RUY_COPTS,
)
# Library compiled from wait.cc with public header wait.h; depends on :time.
# Because it has a source file, the options in RUY_COPTS apply to its compilation.
cc_library(
name = "wait",
srcs = ["wait.cc"],
hdrs = ["wait.h"],
copts = RUY_COPTS,
deps = [":time"],
)
@ -52,6 +76,7 @@ cc_test(
# Header-only library exposing size_util.h; depends on :check_macros.
cc_library(
name = "size_util",
hdrs = ["size_util.h"],
copts = RUY_COPTS,
deps = [":check_macros"],
)
@ -63,6 +88,7 @@ cc_library(
hdrs = [
"tune.h",
],
copts = RUY_COPTS,
deps = [
":opt_set",
":platform",
@ -95,6 +121,7 @@ cc_library(
hdrs = [
"allocator.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":size_util",
@ -113,6 +140,7 @@ cc_test(
# Header-only library exposing side_pair.h; depends on :check_macros.
cc_library(
name = "side_pair",
hdrs = ["side_pair.h"],
copts = RUY_COPTS,
deps = [":check_macros"],
)
@ -124,6 +152,7 @@ cc_library(
hdrs = [
"block_map.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":opt_set",
@ -141,6 +170,7 @@ cc_library(
hdrs = [
"blocking_counter.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":wait",
@ -155,6 +185,7 @@ cc_library(
hdrs = [
"thread_pool.h",
],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [
":blocking_counter",
@ -171,12 +202,14 @@ cc_library(
hdrs = [
"detect_dotprod.h",
],
copts = RUY_COPTS,
visibility = ruy_visibility(),
)
cc_library(
name = "path",
hdrs = ["path.h"],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [
":platform",
@ -192,6 +225,7 @@ cc_library(
hdrs = [
"trace.h",
],
copts = RUY_COPTS,
deps = [
":block_map",
":check_macros",
@ -208,6 +242,7 @@ cc_library(
hdrs = [
"context.h",
],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [
":allocator",
@ -223,6 +258,7 @@ cc_library(
# Header-only library exposing matrix.h; depends on :check_macros.
# Its visibility comes from ruy_visibility(), which is defined outside this view.
cc_library(
name = "matrix",
hdrs = ["matrix.h"],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [":check_macros"],
)
@ -230,6 +266,7 @@ cc_library(
# Header-only library exposing spec.h; depends on :matrix.
# Its visibility comes from ruy_visibility(), which is defined outside this view.
cc_library(
name = "spec",
hdrs = ["spec.h"],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [":matrix"],
)
@ -237,6 +274,7 @@ cc_library(
cc_library(
name = "internal_matrix",
hdrs = ["internal_matrix.h"],
copts = RUY_COPTS,
deps = [
":check_macros",
":common",
@ -250,6 +288,7 @@ cc_library(
hdrs = [
"common.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":matrix",
@ -272,6 +311,7 @@ cc_library(
"kernel_common.h",
"kernel_x86.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":common",
@ -301,6 +341,7 @@ cc_library(
"pack_common.h",
"pack_x86.h",
],
copts = RUY_COPTS,
deps = [
":check_macros",
":common",
@ -317,6 +358,7 @@ cc_library(
cc_library(
name = "trmul_params",
hdrs = ["trmul_params.h"],
copts = RUY_COPTS,
deps = [
":internal_matrix",
":side_pair",
@ -328,6 +370,7 @@ cc_library(
name = "trmul",
srcs = ["trmul.cc"],
hdrs = ["trmul.h"],
copts = RUY_COPTS,
deps = [
":allocator",
":block_map",
@ -359,6 +402,7 @@ cc_library(
"ruy.h",
"ruy_advanced.h",
],
copts = RUY_COPTS,
visibility = ruy_visibility(),
deps = [
":check_macros",
@ -414,6 +458,7 @@ cc_library(
testonly = True,
srcs = ["pmu.cc"],
hdrs = ["pmu.h"],
copts = RUY_COPTS,
deps = [":check_macros"],
)
@ -422,6 +467,7 @@ cc_library(
name = "test_lib",
testonly = True,
hdrs = ["test.h"],
copts = RUY_COPTS,
# need defines, not copts, because it's controlling a header, test.h
defines = ruy_test_ext_defines(),
linkopts = select({
@ -442,6 +488,7 @@ cc_library(
ruy_benchmark(
name = "benchmark",
srcs = ["benchmark.cc"],
copts = RUY_COPTS,
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@ -455,6 +502,7 @@ ruy_benchmark(
ruy_test(
name = "test_fast",
srcs = ["test_fast.cc"],
copts = RUY_COPTS,
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("f64", "f32", "f64", "f32"),
@ -470,6 +518,7 @@ ruy_test(
ruy_test(
name = "test_slow",
srcs = ["test_slow.cc"],
copts = RUY_COPTS,
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@ -483,6 +532,7 @@ ruy_test(
ruy_test(
name = "test_special_specs",
srcs = ["test_special_specs.cc"],
copts = RUY_COPTS,
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@ -493,6 +543,7 @@ ruy_test(
ruy_benchmark_opt_sets(
name = "benchmark_opt_set",
srcs = ["benchmark.cc"],
copts = RUY_COPTS,
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),

View File

@ -6,12 +6,12 @@ corresponding to tuples of types for LHS, RHS, accumulator
and destination.
"""
def ruy_test(name, srcs, lhs_rhs_accum_dst, tags = []):
def ruy_test(name, srcs, lhs_rhs_accum_dst, copts, tags = []):
for (lhs, rhs, accum, dst) in lhs_rhs_accum_dst:
native.cc_test(
name = "%s_%s_%s_%s_%s" % (name, lhs, rhs, accum, dst),
srcs = srcs,
copts = [
copts = copts + [
"-DRUY_TEST_LHSSCALAR=%s" % lhs,
"-DRUY_TEST_RHSSCALAR=%s" % rhs,
"-DRUY_TEST_ACCUMSCALAR=%s" % accum,
@ -24,14 +24,14 @@ def ruy_test(name, srcs, lhs_rhs_accum_dst, tags = []):
tags = tags,
)
def ruy_benchmark(name, srcs, lhs_rhs_accum_dst):
def ruy_benchmark(name, srcs, lhs_rhs_accum_dst, copts):
tags = ["req_dep=@gemmlowp//:profiler"]
for (lhs, rhs, accum, dst) in lhs_rhs_accum_dst:
native.cc_binary(
name = "%s_%s_%s_%s_%s" % (name, lhs, rhs, accum, dst),
testonly = True,
srcs = srcs,
copts = [
copts = copts + [
"-DRUY_TEST_LHSSCALAR=%s" % lhs,
"-DRUY_TEST_RHSSCALAR=%s" % rhs,
"-DRUY_TEST_ACCUMSCALAR=%s" % accum,
@ -44,7 +44,7 @@ def ruy_benchmark(name, srcs, lhs_rhs_accum_dst):
tags = tags,
)
def ruy_benchmark_opt_sets(name, opt_sets, srcs, lhs_rhs_accum_dst):
def ruy_benchmark_opt_sets(name, opt_sets, srcs, lhs_rhs_accum_dst, copts):
tags = ["req_dep=@gemmlowp//:profiler"]
for opt_set in opt_sets:
for (lhs, rhs, accum, dst) in lhs_rhs_accum_dst:
@ -52,7 +52,7 @@ def ruy_benchmark_opt_sets(name, opt_sets, srcs, lhs_rhs_accum_dst):
name = "%s_%s_%s_%s_%s_%s" % (name, opt_set, lhs, rhs, accum, dst),
testonly = True,
srcs = srcs,
copts = [
copts = copts + [
"-DRUY_TEST_LHSSCALAR=%s" % lhs,
"-DRUY_TEST_RHSSCALAR=%s" % rhs,
"-DRUY_TEST_ACCUMSCALAR=%s" % accum,