So far, when multiple threads needed to pack an LHS or RHS block that hadn't been packed yet, they could all concurrently do the packing work. This was correct, since any concurrent writes would write the same values, but it was the reason why the packing code required relaxed atomic stores. The upside was that our packing code never blocked.

Requiring relaxed atomic stores was a downside in its own right, already from a general source-code-complexity perspective, and more so in the context of architecture-specific packing code paths that may need to use SIMD intrinsics, for which there may be no relaxed-atomic equivalent. Work-arounds were available, but they further added to the source-code-complexity concern. Meanwhile, a naive blocking version of this code was found to perform just as well in most cases, and only slightly worse in a worst case of multithreading across CPU cores of very different speeds.

This CL goes a little farther than a naive blocking version. It also takes advantage of the observation that if a thread finds that the LHS block it needs is already being packed by another thread, then instead of just waiting, it can pack the RHS block in the meantime, and conversely. This should reduce the amount of time spent waiting.

A further CL is expected to go farther in letting threads do other packing work, e.g. moving on to the next few blocks, which would typically be needed in the near future. Regardless of the specifics, the fact that we have these different tricks to play to reduce overhead as needed makes me confident that this potentially blocking approach is the right one.

PiperOrigin-RevId: 259994760
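To make the "pack the other side while waiting" idea concrete, here is a minimal standalone sketch of that trick. It is not the ruy implementation: BlockState, TryStartPacking, PackBlock, WaitUntilPacked and EnsureBothSidesPacked are hypothetical placeholders for whatever per-block bookkeeping and synchronization the real TrMul loop uses.

#include <atomic>

// Mirrors the LHS/RHS distinction; purely illustrative.
enum class Side { kLhs, kRhs };

struct BlockState {
  std::atomic<bool> claimed{false};  // some thread has started packing it
  std::atomic<bool> packed{false};   // packing has finished
};

// Returns true iff the calling thread won the race and must pack the block.
inline bool TryStartPacking(BlockState& b) {
  bool expected = false;
  return b.claimed.compare_exchange_strong(expected, true);
}

// Placeholder for the real architecture-specific packing work.
inline void PackBlock(Side /*side*/) {}

// Blocks until another thread finishes packing. A real implementation would
// wait on a condition variable or similar rather than spin.
inline void WaitUntilPacked(const BlockState& b) {
  while (!b.packed.load(std::memory_order_acquire)) {
  }
}

// Ensures both sides of the current block are packed. If another thread is
// already packing one side, pack the other side first instead of idling,
// and only block at the end on whatever is still in flight.
inline void EnsureBothSidesPacked(BlockState& lhs, BlockState& rhs) {
  const bool pack_lhs = TryStartPacking(lhs);
  const bool pack_rhs = TryStartPacking(rhs);
  if (pack_lhs) {
    PackBlock(Side::kLhs);
    lhs.packed.store(true, std::memory_order_release);
  }
  if (pack_rhs) {
    PackBlock(Side::kRhs);
    rhs.packed.store(true, std::memory_order_release);
  }
  if (!pack_lhs) WaitUntilPacked(lhs);
  if (!pack_rhs) WaitUntilPacked(rhs);
}

The point of the sketch is only the ordering: claim whatever work is available, do it, and defer the wait to the very end, so waiting happens only when there is truly nothing left for this thread to pack.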
/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Implementation of low-level pre-packing API.

#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_

#include <functional>

#include "tensorflow/lite/experimental/ruy/context.h"
#include "tensorflow/lite/experimental/ruy/dispatch.h"
#include "tensorflow/lite/experimental/ruy/matrix.h"
#include "tensorflow/lite/experimental/ruy/path.h"
#include "tensorflow/lite/experimental/ruy/side_pair.h"
#include "tensorflow/lite/experimental/ruy/spec.h"
#include "tensorflow/lite/experimental/ruy/tune.h"

namespace ruy {

template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
          typename DstScalar, typename Spec>
void PrePackForMulInternal(const Matrix<LhsScalar>& lhs,
                           const Matrix<RhsScalar>& rhs, const Spec& spec,
                           Context* context, Matrix<DstScalar>* dst,
                           SidePair<PrepackedMatrix*> prepacked,
                           std::function<void*(std::size_t)> alloc_fn) {
  gemmlowp::ScopedProfilingLabel label("PrePackForMul");
  Path the_path = context->GetPathToTake<CompiledPaths>();
  RUY_CHECK(the_path != Path::kReference);
  constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference;
  Matrix<LhsScalar> transposed_lhs(lhs);
  Transpose(&transposed_lhs);
  TrMulParams params;
  CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, context, dst,
                                        the_path, &params);

  const SidePair<int> origin{0, 0};
  const SidePair<int> rounded_dims{params.packed[Side::kLhs].layout.cols,
                                   params.packed[Side::kRhs].layout.cols};

  Tuning tuning = context->GetMainThreadTuning();
  for (Side side : {Side::kLhs, Side::kRhs}) {
    if (prepacked[side]) {
      prepacked[side]->data_size = DataSize(params.packed[side]);
      prepacked[side]->sums_size = SumsSize(params.packed[side]);
      prepacked[side]->data = alloc_fn(prepacked[side]->data_size);
      prepacked[side]->sums = alloc_fn(prepacked[side]->sums_size);
      params.packed[side].data = prepacked[side]->data;
      params.packed[side].sums = prepacked[side]->sums;
      params.RunPack(side, tuning, origin[side], rounded_dims[side]);
    }
  }
}

template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
          typename DstScalar, typename Spec>
void MulWithPrepackedInternal(const Matrix<LhsScalar>& lhs,
                              const Matrix<RhsScalar>& rhs, const Spec& spec,
                              Context* context, Matrix<DstScalar>* dst,
                              SidePair<PrepackedMatrix*> prepacked) {
  gemmlowp::ScopedProfilingLabel label("MulWithPrepacked");

  EnforceLayoutSupport<Spec>(lhs.layout, rhs.layout, dst->layout);
  EnforceZeroPointSupport<Spec>(lhs.zero_point, rhs.zero_point,
                                dst->zero_point);

  Path the_path = context->GetPathToTake<CompiledPaths>();
  RUY_CHECK(the_path != Path::kReference);
  constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference;
  Matrix<LhsScalar> transposed_lhs(lhs);
  Transpose(&transposed_lhs);
  TrMulParams params;
  CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, context, dst,
                                        the_path, &params);

  for (Side side : {Side::kLhs, Side::kRhs}) {
    if (prepacked[side]) {
      params.packed[side].data = prepacked[side]->data;
      params.packed[side].sums = prepacked[side]->sums;
      params.is_prepacked[side] = true;
    }
  }

  TrMul(&params, context);
}

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_
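For context, here is a hedged sketch of how a caller might drive these two internal entry points: pack both sides once, then reuse the packed representations across repeated multiplications. It is not taken from the ruy sources; in practice these functions are reached through public wrappers, and the buffer-ownership lambda, the choice of kAllPaths, and the lack of any alignment handling here are illustrative assumptions.

#include <cstddef>
#include <memory>
#include <vector>

#include "tensorflow/lite/experimental/ruy/prepack.h"

void ExamplePrepackedMul() {
  const float lhs_data[] = {1, 2, 3, 4};
  const float rhs_data[] = {1, 0, 0, 1};
  float dst_data[4] = {};

  ruy::Matrix<float> lhs;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout);
  lhs.data = lhs_data;
  ruy::Matrix<float> rhs;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout);
  rhs.data = rhs_data;
  ruy::Matrix<float> dst;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout);
  dst.data = dst_data;

  ruy::BasicSpec<float, float> spec;
  ruy::Context context;

  // Keeps the packed buffers alive for as long as they are reused.
  // A real caller would also care about alignment of these allocations.
  std::vector<std::unique_ptr<char[]>> buffers;
  auto alloc_fn = [&buffers](std::size_t size) -> void* {
    buffers.push_back(std::unique_ptr<char[]>(new char[size]));
    return buffers.back().get();
  };

  ruy::PrepackedMatrix prepacked_lhs;
  ruy::PrepackedMatrix prepacked_rhs;
  ruy::SidePair<ruy::PrepackedMatrix*> prepacked{&prepacked_lhs,
                                                 &prepacked_rhs};

  // Pack both sides once...
  ruy::PrePackForMulInternal<ruy::kAllPaths>(lhs, rhs, spec, &context, &dst,
                                             prepacked, alloc_fn);
  // ...then amortize that cost over many multiplications with the same
  // operands.
  for (int i = 0; i < 3; ++i) {
    ruy::MulWithPrepackedInternal<ruy::kAllPaths>(lhs, rhs, spec, &context,
                                                  &dst, prepacked);
  }
}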