From 4c674a64c8f5d6850c2907d5686c9c10bea55b33 Mon Sep 17 00:00:00 2001
From: Renjie Liu
Date: Thu, 28 May 2020 19:51:43 -0700
Subject: [PATCH] Migrate float/uint8 broadcast fivefold add to use binary broadcast fivefold.

PiperOrigin-RevId: 313707010
Change-Id: I800cbc4406bad709cedecbfa0e41b4e465254f75
---
 .../internal/optimized/optimized_ops.h        | 402 ++++++------------
 .../reference/process_broadcast_shapes.h      |   4 +
 tensorflow/lite/kernels/sub.cc                |  10 +-
 3 files changed, 134 insertions(+), 282 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 746ed622632..6f478daab68 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -234,6 +234,100 @@ inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
 }
 #endif
 
+template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
+inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params,
+                                    const RuntimeShape& unswitched_input1_shape,
+                                    const T* unswitched_input1_data,
+                                    const RuntimeShape& unswitched_input2_shape,
+                                    const T* unswitched_input2_data,
+                                    const RuntimeShape& output_shape,
+                                    T* output_data, ElementwiseF elementwise_f,
+                                    ScalarBroadcastF scalar_broadcast_f) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const T* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const T* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise add of
+  // sections of the arrays.
+  T* output_data_ptr = output_data;
+  const T* input1_data_ptr = input1_data;
+  const T* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for
+  // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+ for (int i0 = 0; i0 < y0; ++i0) { + const T* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } else { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) { + const T* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + inline void AddBiasAndEvalActivationFunction(float output_activation_min, float output_activation_max, const RuntimeShape& bias_shape, @@ -2073,186 +2167,6 @@ inline void Add(const ArithmeticParams& params, output_map = output_map.cwiseMin(params.quantized_activation_max); } -inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, - const RuntimeShape& output_shape, - uint8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastAddFivefold/8bit"); - - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input1_multiplier = unswitched_params.input2_multiplier; - switched_params.input1_shift = unswitched_params.input2_shift; - switched_params.input2_offset = unswitched_params.input1_offset; - switched_params.input2_multiplier = unswitched_params.input1_multiplier; - switched_params.input2_shift = unswitched_params.input1_shift; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const uint8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const uint8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - uint8* output_data_ptr = output_data; - const uint8* input1_data_ptr = input1_data; - const uint8* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. 
y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. - input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - input1_data_ptr += 1; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - -inline void BroadcastAddFivefold(const ArithmeticParams& params, - const RuntimeShape& unswitched_input1_shape, - const float* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const float* unswitched_input2_data, - const RuntimeShape& output_shape, - float* output_data) { - ruy::profiler::ScopeLabel label("BroadcastAddFivefold/float"); - - const bool use_unswitched = - params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const float* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const float* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - float* output_data_ptr = output_data; - const float* input1_data_ptr = input1_data; - const float* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. 
-  int y0 = params.broadcast_shape[0];
-  int y1 = params.broadcast_shape[1];
-  int y2 = params.broadcast_shape[2];
-  int y3 = params.broadcast_shape[3];
-  int y4 = params.broadcast_shape[4];
-  if (y4 > 1) {
-    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
-    // dimension.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const float* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          for (int i3 = 0; i3 < y3; ++i3) {
-            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                           output_data_ptr);
-            input2_data_ptr += y4;
-            output_data_ptr += y4;
-          }
-          // We have broadcast y4 of input1 data y3 times, and now move on.
-          input1_data_ptr += y4;
-        }
-      }
-      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
-      input2_data_reset = input2_data_ptr;
-    }
-  } else {
-    // Special case of y4 == 1, in which the innermost loop is a single element
-    // and can be combined with the next (y3) as an inner broadcast.
-    //
-    // Note that this handles the case of pure scalar broadcast when
-    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
-    // broadcast with batch (as y2 > 1).
-    //
-    // NOTE The process is the same as the above general case except simplified
-    // for y4 == 1 and the loop over y3 is contained within the
-    // AddScalarBroadcast function.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const float* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
-                             output_data_ptr);
-          input2_data_ptr += y3;
-          output_data_ptr += y3;
-          input1_data_ptr += 1;
-        }
-      }
-      input2_data_reset = input2_data_ptr;
-    }
-  }
-}
-
 template <typename T>
 inline void BroadcastAddDispatch(
     const ArithmeticParams& params, const RuntimeShape& input1_shape,
@@ -2263,8 +2177,37 @@ inline void BroadcastAddDispatch(
                               input2_data, output_shape, output_data);
   }
 
-  BroadcastAddFivefold(params, input1_shape, input1_data, input2_shape,
-                       input2_data, output_shape, output_data);
+  BinaryBroadcastFiveFold(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      static_cast<void (*)(int, const ArithmeticParams&, const T*, const T*,
+                           T*)>(AddElementwise),
+      static_cast<void (*)(int, const ArithmeticParams&, T, const T*, T*)>(
+          AddScalarBroadcast));
+}
+
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  BroadcastAddDispatch(unswitched_params, unswitched_input1_shape,
+                       unswitched_input1_data, unswitched_input2_shape,
+                       unswitched_input2_data, output_shape, output_data);
+}
+
+inline void BroadcastAddFivefold(const ArithmeticParams& params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const float* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const float* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 float* output_data) {
+  BroadcastAddDispatch(params, unswitched_input1_shape, unswitched_input1_data,
+                       unswitched_input2_shape, unswitched_input2_data,
+                       output_shape, output_data);
 }
 
 inline void MulElementwise(int size, const ArithmeticParams& params,
@@ -7979,101 +7922,6 @@ inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params,
   }
 }
 
-template <typename ElementwiseF, typename ScalarBroadcastF>
-inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params,
-                                    const
RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, - const RuntimeShape& output_shape, - int8* output_data, - ElementwiseF elementwise_f, - ScalarBroadcastF scalar_broadcast_f) { - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input1_multiplier = unswitched_params.input2_multiplier; - switched_params.input1_shift = unswitched_params.input2_shift; - switched_params.input2_offset = unswitched_params.input1_offset; - switched_params.input2_multiplier = unswitched_params.input1_multiplier; - switched_params.input2_shift = unswitched_params.input1_shift; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const int8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const int8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - int8* output_data_ptr = output_data; - const int8* input1_data_ptr = input1_data; - const int8* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for - // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. - input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single - // element and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except - // simplified for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. 
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const int8* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr,
-                             output_data_ptr);
-          input2_data_ptr += y3;
-          output_data_ptr += y3;
-          input1_data_ptr += 1;
-        }
-      }
-      input2_data_reset = input2_data_ptr;
-    }
-  }
-}
-
 template <typename Op>
 inline void BroadcastMaximumDispatch(const ArithmeticParams& params,
                                      const RuntimeShape& input1_shape,
diff --git a/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h b/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
index 8e1a6c85919..40f779c5bdf 100644
--- a/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
+++ b/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
@@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
           BroadcastableOpCategory::kFirstInputBroadcastsFast &&
       params->broadcast_category !=
           BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+    // This is unreachable because at least one else clause in the above loop
+    // must be reached.
+    TFLITE_DCHECK(false);
+
     params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
     return false;
   }
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 1b04143d222..a2282a0545b 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -326,11 +326,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
         TF_LITE_SUB(reference_ops, Add, uint8_t);
       }
     } else {
-      if (op_params.broadcast_category ==
-          BroadcastableOpCategory::kGenericBroadcast) {
-        TF_LITE_SUB(optimized_ops, BroadcastAdd4DSlow, uint8_t);
-      } else if (need_broadcast) {
-        TF_LITE_SUB(optimized_ops, BroadcastAddFivefold, uint8_t);
+      if (need_broadcast) {
+        optimized_ops::BroadcastAddDispatch(
+            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1),
+            GetTensorShape(input2), GetTensorData<uint8_t>(input2),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
      } else {
        TF_LITE_SUB(optimized_ops, Add, uint8_t);
      }
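Reviewer note, not part of the patch: for anyone new to the fivefold layout that BinaryBroadcastFiveFold consolidates, the standalone sketch below illustrates the y4 > 1 walk with plain float buffers. The concrete shapes, the buffer names and the elementwise_add lambda are invented for the example (they stand in for ArithmeticParams and AddElementwise); only the y0..y4 loop structure mirrors the template above. It shows why input1 holds y0 * y1 * y2 * y4 elements and input2 holds y0 * y2 * y3 * y4 elements, as stated in the comment in the new function.

// Minimal sketch of the fivefold broadcast walk (y4 > 1 case). Illustrative
// only; not TensorFlow Lite code.
#include <cstdio>
#include <vector>

int main() {
  // Broadcast shape y0..y4. After flattening:
  //   input1 has y0 * y1 * y2 * y4 elements (y3 is broadcast for input 1),
  //   input2 has y0 * y2 * y3 * y4 elements (y1 is broadcast for input 2),
  //   output has y0 * y1 * y2 * y3 * y4 elements.
  const int y0 = 1, y1 = 2, y2 = 1, y3 = 3, y4 = 4;

  std::vector<float> input1(y0 * y1 * y2 * y4);
  std::vector<float> input2(y0 * y2 * y3 * y4);
  std::vector<float> output(y0 * y1 * y2 * y3 * y4);
  for (size_t i = 0; i < input1.size(); ++i) input1[i] = static_cast<float>(i);
  for (size_t i = 0; i < input2.size(); ++i)
    input2[i] = 100.0f * static_cast<float>(i);

  // Stand-in for the elementwise_f functor: add y4 contiguous elements.
  auto elementwise_add = [](int size, const float* in1, const float* in2,
                            float* out) {
    for (int i = 0; i < size; ++i) out[i] = in1[i] + in2[i];
  };

  float* output_ptr = output.data();
  const float* input1_ptr = input1.data();
  const float* input2_reset = input2.data();
  for (int i0 = 0; i0 < y0; ++i0) {
    const float* input2_ptr = nullptr;
    for (int i1 = 0; i1 < y1; ++i1) {
      input2_ptr = input2_reset;  // Second input rewinds for every i1.
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          elementwise_add(y4, input1_ptr, input2_ptr, output_ptr);
          input2_ptr += y4;
          output_ptr += y4;
        }
        // The same y4-element section of input1 was reused y3 times.
        input1_ptr += y4;
      }
    }
    // Second input only advances once the whole y1 loop has finished.
    input2_reset = input2_ptr;
  }

  std::printf("output[0..3] = %g %g %g %g\n", output[0], output[1], output[2],
              output[3]);
  return 0;
}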