/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/algorithm/algorithm.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_format.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/string_util.h"
#include "tensorflow/lite/testing/util.h"
#include "tensorflow/lite/tools/benchmark/benchmark_performance_options.h"
#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
#include "tensorflow/lite/tools/command_line_flags.h"
#include "tensorflow/lite/tools/delegates/delegate_provider.h"
#include "tensorflow/lite/tools/logging.h"

namespace {
const std::string* g_fp32_model_path = nullptr;
const std::string* g_int8_model_path = nullptr;
const std::string* g_string_model_path = nullptr;
}  // namespace

namespace tflite {
namespace benchmark {
namespace {

enum class ModelGraphType { FP32, INT8, STRING };

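// Builds BenchmarkParams for a short test run: 'num_runs' runs bounded by
// 'min_secs'/'max_secs' of wall-clock time, with the "graph" param pointing
// at the model file that matches the requested graph type.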
BenchmarkParams CreateParams(int32_t num_runs, float min_secs, float max_secs,
                             ModelGraphType graph_type = ModelGraphType::FP32) {
  BenchmarkParams params = BenchmarkTfLiteModel::DefaultParams();
  params.Set<int32_t>("num_runs", num_runs);
  params.Set<float>("min_secs", min_secs);
  params.Set<float>("max_secs", max_secs);

  if (graph_type == ModelGraphType::INT8) {
    params.Set<std::string>("graph", *g_int8_model_path);
  } else if (graph_type == ModelGraphType::STRING) {
    params.Set<std::string>("graph", *g_string_model_path);
  } else {
    // By default, simply use the fp32 one.
    params.Set<std::string>("graph", *g_fp32_model_path);
  }
  return params;
}

BenchmarkParams CreateParams() { return CreateParams(2, 1.0f, 150.0f); }
BenchmarkParams CreateFp32Params() {
  return CreateParams(2, 1.0f, 150.0f, ModelGraphType::FP32);
}
BenchmarkParams CreateInt8Params() {
  return CreateParams(2, 1.0f, 150.0f, ModelGraphType::INT8);
}
BenchmarkParams CreateStringParams() {
  return CreateParams(2, 1.0f, 150.0f, ModelGraphType::STRING);
}

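// Returns a path under TEST_TMPDIR for a temporary test file; assumes the
// TEST_TMPDIR environment variable is set by the test runner.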
std::string CreateFilePath(const std::string& file_name) {
  return std::string(getenv("TEST_TMPDIR")) + file_name;
}

// Writes 'num_elements' elements, each filled with 'file_value', to
// 'file_path', using the element width implied by 'graph_type'.
void WriteInputLayerValueFile(const std::string& file_path,
                              ModelGraphType graph_type, int num_elements,
                              char file_value = 'a') {
  std::ofstream file(file_path);
  int bytes = 0;
  switch (graph_type) {
    case ModelGraphType::FP32:
      bytes = 4 * num_elements;
      break;
    case ModelGraphType::INT8:
      bytes = num_elements;
      break;
    default:
      LOG(WARNING) << absl::StrFormat(
          "ModelGraphType(enum_value:%d) is not known.", graph_type);
      LOG(WARNING) << "The element size is assumed to be 1 byte in tests.";
      bytes = num_elements;
      break;
  }
  std::vector<char> buffer(bytes, file_value);
  file.write(buffer.data(), bytes);
}

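// Overloads that verify an input tensor's contents: byte-wise against a
// single expected value, or string-by-string for string tensors.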
void CheckInputTensorValue(const TfLiteTensor* input_tensor,
                           char expected_value) {
  ASSERT_THAT(input_tensor, testing::NotNull());
  EXPECT_TRUE(std::all_of(
      input_tensor->data.raw, input_tensor->data.raw + input_tensor->bytes,
      [expected_value](char c) { return c == expected_value; }));
}

void CheckInputTensorValue(const TfLiteTensor* input_tensor,
                           int tensor_dim_index,
                           const std::string& expected_value) {
  StringRef tensor_value = GetString(input_tensor, tensor_dim_index);
  EXPECT_TRUE(absl::equal(tensor_value.str, tensor_value.str + tensor_value.len,
                          expected_value.c_str(),
                          expected_value.c_str() + expected_value.length()));
}

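// Test-only subclass exposing the interpreter and the input-preparation
// hooks of BenchmarkTfLiteModel.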
class TestBenchmark : public BenchmarkTfLiteModel {
 public:
  explicit TestBenchmark(BenchmarkParams params)
      : BenchmarkTfLiteModel(std::move(params)) {}
  const tflite::Interpreter* GetInterpreter() { return interpreter_.get(); }

  void Prepare() {
    PrepareInputData();
    ResetInputsAndOutputs();
  }

  const TfLiteTensor* GetInputTensor(int index) {
    return index >= interpreter_->inputs().size()
               ? nullptr
               : interpreter_->input_tensor(index);
  }
};

TEST(BenchmarkTest, DoesntCrashFp32Model) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());

  TestBenchmark benchmark(CreateFp32Params());
  benchmark.Run();
}

TEST(BenchmarkTest, DoesntCrashInt8Model) {
  ASSERT_THAT(g_int8_model_path, testing::NotNull());

  TestBenchmark benchmark(CreateInt8Params());
  benchmark.Run();
}

TEST(BenchmarkTest, DoesntCrashStringModel) {
  ASSERT_THAT(g_string_model_path, testing::NotNull());

  TestBenchmark benchmark(CreateStringParams());
  benchmark.Run();
}

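// A stats recorder that, after emitting the regular output, verifies the
// ordering invariants of the collected multi-run results.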
class TestMultiRunStatsRecorder : public MultiRunStatsRecorder {
 public:
  void OutputStats() override {
    MultiRunStatsRecorder::OutputStats();

    // Check that results have been sorted by average latency in increasing
    // order, and that all incomplete runs come after the completed ones.
    double pre_avg_latency = -1e6;
    bool has_incomplete = false;  // ensure complete/incomplete aren't mixed.
    for (const auto& result : results_) {
      const auto current_avg_latency = result.metrics.inference_time_us().avg();
      if (result.completed) {
        EXPECT_GE(current_avg_latency, pre_avg_latency);
        EXPECT_FALSE(has_incomplete);
      } else {
        EXPECT_EQ(0, result.metrics.inference_time_us().count());
        has_incomplete = true;
      }
      pre_avg_latency = current_avg_latency;
    }
  }
};

TEST(BenchmarkTest, DoesntCrashMultiPerfOptions) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());

  TestBenchmark benchmark(CreateFp32Params());
  BenchmarkPerformanceOptions all_options_benchmark(
      &benchmark, absl::make_unique<TestMultiRunStatsRecorder>());
  all_options_benchmark.Run();
}

TEST(BenchmarkTest, DoesntCrashMultiPerfOptionsWithProfiling) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());

  BenchmarkParams params = CreateFp32Params();
  params.Set<bool>("enable_op_profiling", true);
  TestBenchmark benchmark(std::move(params));
  BenchmarkPerformanceOptions all_options_benchmark(&benchmark);
  all_options_benchmark.Run();
}

TEST(BenchmarkTest, DoesntCrashWithExplicitInputFp32Model) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());

  // Note: the following input-related params are *specific* to the model
  // 'g_fp32_model_path', which is specified as 'lite:testdata/multi_add.bin'
  // for the test.
  BenchmarkParams params = CreateFp32Params();
  params.Set<std::string>("input_layer", "a,b,c,d");
  params.Set<std::string>("input_layer_shape",
                          "1,8,8,3:1,8,8,3:1,8,8,3:1,8,8,3");
  params.Set<std::string>("input_layer_value_range", "d,1,10:b,0,100");
  TestBenchmark benchmark(std::move(params));
  benchmark.Run();
}

TEST(BenchmarkTest, DoesntCrashWithExplicitInputInt8Model) {
  ASSERT_THAT(g_int8_model_path, testing::NotNull());

  // Note: the following input-related params are *specific* to the model
  // 'g_int8_model_path', which is specified as
  // 'lite:testdata/add_quantized_int8.bin' for the test.
  int a_min = 1;
  int a_max = 10;
  BenchmarkParams params = CreateInt8Params();
  params.Set<std::string>("input_layer", "a");
  params.Set<std::string>("input_layer_shape", "1,8,8,3");
  params.Set<std::string>("input_layer_value_range",
                          absl::StrFormat("a,%d,%d", a_min, a_max));
  TestBenchmark benchmark(std::move(params));
  benchmark.Run();

  // Verify that every generated input value falls within [a_min, a_max].
  auto input_tensor = benchmark.GetInputTensor(0);
  ASSERT_THAT(input_tensor, testing::NotNull());
  EXPECT_TRUE(std::all_of(
      input_tensor->data.raw, input_tensor->data.raw + input_tensor->bytes,
      [a_min, a_max](int i) { return a_min <= i && i <= a_max; }));
}

TEST(BenchmarkTest, DoesntCrashWithExplicitInputValueFilesFp32Model) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());
  char file_value_b = 'b';
  const std::string file_path_b = CreateFilePath("fp32_binary_b");
  WriteInputLayerValueFile(file_path_b, ModelGraphType::FP32, 192,
                           file_value_b);
  char file_value_d = 'd';
  const std::string file_path_d = CreateFilePath("fp32_binary_d");
  WriteInputLayerValueFile(file_path_d, ModelGraphType::FP32, 192,
                           file_value_d);

  // Note: the following input-related params are *specific* to the model
  // 'g_fp32_model_path', which is specified as 'lite:testdata/multi_add.bin'
  // for the test.
  BenchmarkParams params = CreateFp32Params();
  params.Set<std::string>("input_layer", "a,b,c,d");
  params.Set<std::string>("input_layer_shape",
                          "1,8,8,3:1,8,8,3:1,8,8,3:1,8,8,3");
  params.Set<std::string>("input_layer_value_files",
                          "d:" + file_path_d + ",b:" + file_path_b);
  TestBenchmark benchmark(std::move(params));
  benchmark.Run();

  CheckInputTensorValue(benchmark.GetInputTensor(1), file_value_b);
  CheckInputTensorValue(benchmark.GetInputTensor(3), file_value_d);
}

TEST(BenchmarkTest, DoesntCrashWithExplicitInputValueFilesInt8Model) {
  ASSERT_THAT(g_int8_model_path, testing::NotNull());
  const std::string file_path = CreateFilePath("int8_binary");
  char file_value = 'a';
  WriteInputLayerValueFile(file_path, ModelGraphType::INT8, 192, file_value);

  // Note: the following input-related params are *specific* to the model
  // 'g_int8_model_path', which is specified as
  // 'lite:testdata/add_quantized_int8.bin' for the test.
  BenchmarkParams params = CreateInt8Params();
  params.Set<std::string>("input_layer", "a");
  params.Set<std::string>("input_layer_shape", "1,8,8,3");
  params.Set<std::string>("input_layer_value_files", "a:" + file_path);
  TestBenchmark benchmark(std::move(params));
  benchmark.Run();

  CheckInputTensorValue(benchmark.GetInputTensor(0), file_value);
}

TEST(BenchmarkTest, DoesntCrashWithExplicitInputValueFilesStringModel) {
  ASSERT_THAT(g_string_model_path, testing::NotNull());
  const std::string file_path = CreateFilePath("string_binary");
  const std::string string_value_0 = "abcd";
  const std::string string_value_1 = "12345";
  const std::string string_value_2 = "a1b2c3d4e5";
  std::ofstream file(file_path);
  // Write each string including the terminating null character ('\0') that
  // std::string::c_str() appends, so the file contains three '\0'-delimited
  // strings.
  file.write(string_value_0.c_str(), string_value_0.length() + 1);
  file.write(string_value_1.c_str(), string_value_1.length() + 1);
  file.write(string_value_2.c_str(), string_value_2.length() + 1);
  file.close();

  // Note: the following input-related params are *specific* to the model
  // 'g_string_model_path', which is specified as
  // 'lite:testdata/string_input_model.bin' for the test.
  BenchmarkParams params = CreateStringParams();
  params.Set<std::string>("input_layer", "a");
  params.Set<std::string>("input_layer_shape", "1,3");
  params.Set<std::string>("input_layer_value_files", "a:" + file_path);
  TestBenchmark benchmark(std::move(params));
  benchmark.Run();

  auto input_tensor = benchmark.GetInputTensor(0);
  ASSERT_THAT(input_tensor, testing::NotNull());
  EXPECT_EQ(GetStringCount(input_tensor), 3);
  CheckInputTensorValue(input_tensor, 0, string_value_0);
  CheckInputTensorValue(input_tensor, 1, string_value_1);
  CheckInputTensorValue(input_tensor, 2, string_value_2);
}

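// RAII helper that materializes (argc, argv) from a list of argument
// strings, packing all C strings into a single owned buffer.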
class ScopedCommandlineArgs {
 public:
  explicit ScopedCommandlineArgs(const std::vector<std::string>& actual_args) {
    argc_ = actual_args.size() + 1;
    argv_ = new char*[argc_];
    const std::string program_name = "benchmark_model";
    int buffer_size = program_name.length() + 1;
    for (const auto& arg : actual_args) buffer_size += arg.length() + 1;
    buffer_ = new char[buffer_size];
    auto next_start = program_name.copy(buffer_, program_name.length());
    buffer_[next_start++] = '\0';
    argv_[0] = buffer_;
    for (int i = 0; i < actual_args.size(); ++i) {
      const auto& arg = actual_args[i];
      argv_[i + 1] = buffer_ + next_start;
      next_start += arg.copy(argv_[i + 1], arg.length());
      buffer_[next_start++] = '\0';
    }
  }
  ~ScopedCommandlineArgs() {
    delete[] argv_;
    delete[] buffer_;
  }

  int argc() const { return argc_; }

  char** argv() const { return argv_; }

 private:
  char* buffer_;  // the buffer for all arguments.
  int argc_;
  char** argv_;  // Each char* element points to each argument.
};

TEST(BenchmarkTest, RunWithCorrectFlags) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());
  TestBenchmark benchmark(CreateFp32Params());
  ScopedCommandlineArgs scoped_argv({"--num_threads=4"});
  auto status = benchmark.Run(scoped_argv.argc(), scoped_argv.argv());
  EXPECT_EQ(kTfLiteOk, status);
}

TEST(BenchmarkTest, RunWithWrongFlags) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());
  TestBenchmark benchmark(CreateFp32Params());
  ScopedCommandlineArgs scoped_argv({"--num_threads=str"});
  auto status = benchmark.Run(scoped_argv.argc(), scoped_argv.argv());
  EXPECT_EQ(kTfLiteError, status);
}

TEST(BenchmarkTest, RunWithUseCaching) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());
  TestBenchmark benchmark(CreateFp32Params());
  ScopedCommandlineArgs scoped_argv({"--use_caching=false"});
  auto status = benchmark.Run(scoped_argv.argc(), scoped_argv.argv());
  EXPECT_EQ(kTfLiteOk, status);
}

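// Listener that verifies the benchmark stopped long before the requested
// 100000000 runs, i.e. that "max_secs" cut the run short.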
class MaxDurationWorksTestListener : public BenchmarkListener {
  void OnBenchmarkEnd(const BenchmarkResults& results) override {
    const int64_t num_actual_runs = results.inference_time_us().count();
    TFLITE_LOG(INFO) << "number of actual runs: " << num_actual_runs;
    EXPECT_GE(num_actual_runs, 1);
    EXPECT_LT(num_actual_runs, 100000000);
  }
};

TEST(BenchmarkTest, MaxDurationWorks) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());
  TestBenchmark benchmark(CreateParams(100000000 /* num_runs */,
                                       1000000.0f /* min_secs */,
                                       0.001f /* max_secs */));
  MaxDurationWorksTestListener listener;
  benchmark.AddListener(&listener);
  benchmark.Run();
}

TEST(BenchmarkTest, ParametersArePopulatedWhenInputShapeIsNotSpecified) {
  ASSERT_THAT(g_fp32_model_path, testing::NotNull());

  TestBenchmark benchmark(CreateParams());
  benchmark.Init();
  benchmark.Prepare();

  auto interpreter = benchmark.GetInterpreter();
  auto inputs = interpreter->inputs();
  ASSERT_GE(inputs.size(), 1);
  auto input_tensor = interpreter->tensor(inputs[0]);

  // Copy the input tensor's bytes to a vector.
  std::vector<char> input_bytes(input_tensor->data.raw,
                                input_tensor->data.raw + input_tensor->bytes);

  // Prepare again; the input data should be regenerated.
  benchmark.Prepare();

  // Expect the regenerated data to differ from the copied data.
  EXPECT_EQ(input_bytes.size(), input_tensor->bytes);
  EXPECT_FALSE(absl::equal(input_bytes.begin(), input_bytes.end(),
                           input_tensor->data.raw,
                           input_tensor->data.raw + input_tensor->bytes));
}

}  // namespace
}  // namespace benchmark
}  // namespace tflite

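// Test entry point. Model paths are supplied via command-line flags, e.g.
// (paths are illustrative only):
//   benchmark_test --fp32_graph=/tmp/multi_add.bin \
//     --int8_graph=/tmp/add_quantized_int8.bin \
//     --string_graph=/tmp/string_input_model.bin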
int main(int argc, char** argv) {
  std::string fp32_model_path, int8_model_path, string_model_path;
  std::vector<tflite::Flag> flags = {
      tflite::Flag::CreateFlag("fp32_graph", &fp32_model_path,
                               "Path to an fp32 model file."),
      tflite::Flag::CreateFlag("int8_graph", &int8_model_path,
                               "Path to an int8 model file."),
      tflite::Flag::CreateFlag("string_graph", &string_model_path,
                               "Path to a string model file."),
  };

  g_fp32_model_path = &fp32_model_path;
  g_int8_model_path = &int8_model_path;
  g_string_model_path = &string_model_path;

  const bool parse_result =
      tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flags);
  if (!parse_result) {
    std::cerr << tflite::Flags::Usage(argv[0], flags);
    return 1;
  }

  ::tflite::LogToStderr();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}